# -*- coding: utf-8 -*-
"""
@author: Wenbo Wang
[Wang2020] Wenbo Wang, Amir Leshem, Dusit Niyato and Zhu Han, "Decentralized Learning for Channel
Allocation in IoT Networks over Unlicensed Bandwidth as a Contextual Multi-player Multi-armed Bandit Game"
License:
This program is licensed under the GPLv2 license. If you use this
code in any way for research that results in publications, please cite our
original article listed above.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
"""
# This file defines the player behavior for the specific SOC-MP-MAB algorithms (SOC in MABAlgorithms2.py)
# see also Players.py for other algorithms
__author__ = "Wenbo Wang"
import numpy as np
from Players import Player

if __name__ == '__main__':
    print("Warning: this script 'Players2.py' is NOT executable.")  # DEBUG
    exit(0)
class SOCPlayer(Player):
    def __init__(self, param):
        """
        SOCPlayer implements the player for the "stable orthogonal allocation (SOC)" algorithm proposed in
        "Multi-player multi-armed bandits for stable allocation in heterogeneous ad-hoc networks", IEEE JSAC, Oct. 2019,
        by Sumit J. Darak and Manjesh K. Hanawal [Sumit2019].
        The algorithm features a protocol that explicitly resolves collisions through channel switching,
        and the channel statistics (indices) are learned with an upper confidence bound (UCB) estimator.
        Channel allocation is obtained through a master-slave allocation process. Social optimality is not guaranteed.
        """
        self.nbArm = param["nbArm"]  # number of arms (channels) for a specific context-player pair
        self.playerID = param["playerID"]

        self.flag_master = False
        self.master_collision = np.zeros(2)
        self.flag_lock = False

        self.selected_arm = 0
        self.policy = -1  # set to an invalid value
        self.time = 0

        self.accumulated_value = np.zeros(self.nbArm)
        self.arm_score = np.zeros(self.nbArm)  # for UCB score computation
        self.nb_observation = np.zeros(self.nbArm)  # number of observed non-zero payoffs
        self.ranked_armIDs = np.array(list(range(0, self.nbArm)))  # ranked in descending order of UCB score
        self.flag_agree_switching = 0  # a 3-state flag, -1: refuse to switch, 0: irrelevant (not requested), 1: agree to switch
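
    # A minimal usage sketch (hypothetical values; in this codebase the simulator
    # normally builds the parameter dictionary):
    #     param = {"nbArm": 5, "playerID": 0}
    #     player = SOCPlayer(param)
    #     arm = player.explore()  # random hopping until the player locks a channel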
    def reset(self):
        self.flag_master = False
        self.flag_lock = False
        self.flag_agree_switching = 0

        self.selected_arm = 0
        self.policy = -1  # set to an invalid value
        self.time = 0

        self.accumulated_value = np.zeros(self.nbArm)
        self.arm_score = np.zeros(self.nbArm)  # for UCB score computation
        self.nb_observation = np.zeros(self.nbArm)  # number of observed non-zero payoffs
        self.ranked_armIDs = np.array(list(range(0, self.nbArm)))  # ranked in descending order of UCB score
        self.master_collision[:] = 0
    # --- functionalities
    def explore(self, context=None, time=None):
        """
        explore() is equivalent to the "Random Hopping" algorithm in [Sumit2019]:
        it lets the players orthogonalize on the channels by drawing their actions uniformly at random.
        flag_lock has to be set after observing the collision feedback.
        """
        if self.flag_lock == True:
            # locked: keep choosing the same action
            if self.policy == -1:
                self.policy = self.selected_arm
        else:
            self.selected_arm = np.random.randint(self.nbArm)

        return self.selected_arm
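
    # Sketch of the orthogonalization phase (hypothetical driver loop; the actual
    # scheduling logic lives in MABAlgorithms2.py):
    #     arms = [p.explore() for p in players]
    #     collisions = count_occupancy(arms)   # hypothetical helper: collisions[a] = #players on arm a
    #     # each player locks (flag_lock = True) once it draws an uncontested arm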
    def learn_arm_value(self, context=None, arm_values=None, collisions=None):
        # update the UCB score
        if self.flag_agree_switching == -1:
            # no arm is selected; this case happens when a slave node evacuates a channel
            # to notify the master that it won't switch
            pass
        elif collisions[self.selected_arm] == 1:
            # exactly one player on the arm, i.e., no collision
            self.flag_lock = True
            self.time = self.time + 1  # only incremented when a good sample is obtained

            armID = self.selected_arm
            self.nb_observation[armID] = self.nb_observation[armID] + 1
            self.accumulated_value[armID] = self.accumulated_value[armID] + arm_values[armID]

            # update the UCB1 scores: empirical mean + exploration bonus sqrt(2*ln(t)/n)
            self.arm_score = (self.accumulated_value / (self.nb_observation + 1e-9)
                              + np.sqrt(2 * np.log(self.time) / (self.nb_observation + 1e-9)))
            # get the preference (descending order of score)
            self.ranked_armIDs = np.argsort(-self.arm_score)
    def exploit(self, context=None, time=None):
        # SOC does not have a distinct exploitation phase; the players use a collision
        # avoidance-like protocol to explicitly allocate the channels among themselves
        assert self.policy != -1, "policy is not obtained"
        self.selected_arm = self.policy
        return self.selected_arm
    def set_master(self, MB_id):
        assert self.flag_lock == True, "the channel is not locked yet"
        # check whether MB_id is the currently reserved channel (policy)
        if self.policy == MB_id:
            # print("set_master(): master node ID {} at MB {}".format(self.playerID, MB_id))  # debugging
            self.flag_master = True
            # reset the recorder
            self.master_collision[:] = 0
        else:
            # print("set_master(): slave node ID {} at MB {}".format(self.playerID, MB_id))  # debugging
            self.flag_master = False

        return self.flag_master
    def set_master_action(self, SB_id):
        """
        Set the action of the master node (as the channel indicated by the current block ID).
        """
        assert self.flag_lock == True, "the channel is not locked yet"
        assert self.flag_master == True, "not a master node"
        assert self.policy == self.selected_arm, "action not aligned to policy"

        # get the ranked arms without self.selected_arm
        tmp_arm_rank = np.ndarray.tolist(self.ranked_armIDs)
        # see footnote 1 of [Sumit2019]
        current_arm_rank = tmp_arm_rank.index(self.selected_arm)
        tmp_arm_rank.pop(current_arm_rank)

        if SB_id - 1 < current_arm_rank:
            # print("Master ID-{}: av-{:.2} ---> av-{:.2}".format(self.playerID, self.arm_score[self.selected_arm],
            #       self.arm_score[tmp_arm_rank[SB_id - 1]]))  # debugging
            master_arm_choice = tmp_arm_rank[SB_id - 1]
        else:
            master_arm_choice = self.selected_arm

        # set policy to the currently reserved channel, signal over the new channel
        self.policy = self.selected_arm
        self.selected_arm = master_arm_choice

        return self.selected_arm, self.policy  # new, old (MB)
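
    # Worked example (hypothetical numbers): suppose ranked_armIDs = [3, 1, 0, 2]
    # and the master currently holds arm 1 (rank 1). Removing arm 1 leaves [3, 0, 2];
    # with SB_id = 1, SB_id - 1 = 0 < 1 holds, so the master signals on arm 3, its
    # best arm other than the one it holds, while its policy keeps the reserved arm 1.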
    def decide_switching(self, subslot_id, target_arm=None):
        # has to be called by a slave node
        assert self.flag_lock == True, "the channel is not locked yet"
        assert self.flag_master == False, "not a slave node"
        assert self.policy != -1, "policy is not set"

        if subslot_id == 0:
            # it is in a channel transmit (CT) sub-slot
            assert target_arm is not None, "master arm choice not set"
            if target_arm != self.selected_arm:
                # not requested, do nothing
                self.flag_agree_switching = 0  # not requested
                # print("Slave ID-{}: not requested {} ---> {}".format(self.playerID, self.selected_arm, target_arm))  # debugging
            else:
                arm_rank_list = np.ndarray.tolist(self.ranked_armIDs)
                current_arm_rank = arm_rank_list.index(self.selected_arm)
                requested_arm_rank = arm_rank_list.index(target_arm)
                # print("Slave ID-{}: av-{:.2} ---> av-{:.2}".format(self.playerID, self.arm_score[self.selected_arm],
                #       self.arm_score[target_arm]))  # debugging
                if requested_arm_rank < current_arm_rank:
                    # if the master's arm choice has a higher score, switch
                    self.flag_agree_switching = 1  # agreed
                    self.selected_arm = self.policy  # choose the currently preferred arm
                    self.policy = target_arm  # update the policy
                    # print("UE-{} agrees: CH-{} to CH-{} w/ scores: {} to {}".format(self.playerID,
                    #       self.selected_arm, target_arm, arm_rank_list[current_arm_rank], arm_rank_list[requested_arm_rank]))  # debugging
                else:
                    # if the master's arm choice is worse than the current arm, refuse to switch
                    self.flag_agree_switching = -1  # refused
                    # no change to the policy
                    self.selected_arm = self.policy
        else:
            # it is in a channel switch (CS) sub-slot
            if self.flag_agree_switching == -1:
                # refused switching: vacate the channel for one slot
                self.selected_arm = -1
            else:
                # if self.flag_agree_switching == 1: agreed to switch, stay on the channel to collide
                # if self.flag_agree_switching == 0: not requested to switch, stay on the channel
                pass

        return self.selected_arm
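
    # Outcome of decide_switching() in the CS sub-slot, by flag value:
    #     flag_agree_switching = -1  -> the slave vacates (selected_arm = -1), signaling refusal
    #     flag_agree_switching =  1  -> the slave stays put, and the resulting collision signals agreement
    #     flag_agree_switching =  0  -> the slave was not requested and simply keeps transmitting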
    def update_policy(self, subslot_id, collisions):
        # the original paper does not specify when to stop updating the arm-value estimation,
        # so we assume that it never stops
        assert self.flag_lock == True, "the channel is not locked yet"

        # update actions
        if subslot_id == 0:
            # CT sub-slot: only the master records its collision feedback
            if self.flag_master == True:
                self.master_collision[0] = collisions[self.selected_arm]
            else:
                pass
        elif subslot_id == 1:
            if self.flag_master == True:
                self.master_collision[1] = collisions[self.selected_arm]
                # update policy and action according to Fig. 2 of [Sumit2019]
                if self.master_collision[0] > 1 and self.master_collision[1] > 1:  # scenario 1 (collision in both sub-slots)
                    # switching allowed
                    self.policy = self.selected_arm
                    # print("Master ID-{}: policy updated w/ switching".format(self.playerID))  # debugging
                elif self.master_collision[0] == 1 and self.master_collision[1] == 1:  # scenario 3 (no collision in either sub-slot)
                    self.policy = self.selected_arm
                    # print("Master ID-{}: policy updated for vacant channel".format(self.playerID))  # debugging
                else:
                    # roll back
                    # print("Master ID-{}: policy rolled back".format(self.playerID))  # debugging
                    pass
                # reset the recorder
                self.master_collision[:] = 0
            else:
                # reset the flag to "not requested"
                self.flag_agree_switching = 0
        else:
            raise Exception("invalid sub-slot ID.")

        self.selected_arm = self.policy
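
# A rough sketch of one SOC time slot as these hooks are meant to be driven
# (hypothetical driver; the actual scheduling logic lives in MABAlgorithms2.py):
#     1. orthogonalization:  each player calls explore() until flag_lock is set
#     2. master election:    each player calls set_master(MB_id) for the current MB
#     3. CT sub-slot:        the master calls set_master_action(SB_id); each slave
#                            calls decide_switching(subslot_id=0, target_arm=...)
#     4. CS sub-slot:        slaves call decide_switching(subslot_id=1); all players
#                            call learn_arm_value(...) and update_policy(...)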