# coding: utf-8
# In[1]:
import pmlib
import numpy as np
# In[2]:
# uncomment to list all the functions and classes provided by pmlib
# dir(pmlib)
# In[3]:
# We create a PM instance for a stochastic 3-armed Bernoulli Multi-Armed Bandit (MAB)
# the parameters are the expected rewards of the (independent) arms
bandit = pmlib.BernoulliBandit([0.75,0.5,0.25])
# In[4]:
# show the PM game matrices and the outcome distribution
bandit.dump(plot=True, nice=True)
# note that for this example we have 3 actions (one for each arm)
# and 2^3 outcomes (one for each possible reward vector)
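# A quick sanity check of that count (assuming bandit.OutcomeDist holds one probability
# per outcome, as used in the next cell):
print(len(bandit.OutcomeDist))  # expected: 2**3 = 8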
# In[ ]:
# We can also consider bandits with strongly correlated arms
# by specifying, for instance, an outcome distribution
# where exactly two arms win at the same time
bandit.OutcomeDist = np.array([0,0,0,1/3.,0,1/3.,1/3.,0])
bandit.dump(plot=True, nice=True)
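# To see why this distribution encodes "exactly two winning arms", we can enumerate the
# 2**3 joint reward vectors. NOTE: the index-to-vector pairing below is an assumption
# (arm 0 taken as the most significant bit); check the dump() output above to confirm
# the ordering pmlib actually uses.
import itertools
for i, rewards in enumerate(itertools.product([0, 1], repeat=3)):
    if bandit.OutcomeDist[i] > 0:
        print("outcome", i, "-> reward vector", rewards, "with probability", bandit.OutcomeDist[i])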
# In[5]:
# Other well-known instances of PM are dynamic pricing and apple tasting
dp = pmlib.DynamicPricingPM([0.1,0.1,0.7,0.1], 2.)
at = pmlib.AppleTasting([0.05,0.95])
at.dump(nice=True)
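# The dynamic pricing instance can be inspected the same way:
# dp.dump(nice=True)  # uncomment to print its game matrices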
# In[ ]:
# We can also create a dueling bandit instance where an action is a pair of arms
# and the feedback is the relative reward.
print("** Dueling bandit problem")
dueling = pmlib.BinaryUtilityDuelingBanditPM([0.75,0.5,0.5,0.25])
dueling.dump(plot=False, nice=True)
# In[6]:
# We provide a list of benchmark game settings:
# pmlib.benchmark_games
# the pmlib.ProblemClass() function computes the complexity class of any finite game in the PM hierarchy.
#
# It can be either:
# * trivial: \Theta(1) minimax regret
# * easy: \Theta(\sqrt{T}) minimax regret
# * hard: \Theta(T^{2/3}) minimax regret
# * intractable: \Theta(T) (linear) minimax regret
#
# (see Bartok et al. "Partial monitoring – classification, regret bounds, and algorithms" 2013)
#
# This function and many others are based on the Python wrapper to the Parma Polyhedra Library (ppl)
# see http://bugseng.com/products/ppl/
# and https://pypi.python.org/pypi/pplpy/0.6
import ppl
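# As a first illustration, we can classify the Apple tasting instance created above
# (the loop below runs the same analysis on every benchmark game):
hierarchy, why = pmlib.ProblemClass(at)
print("Apple tasting is", hierarchy, "because", why)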
# We can analyze all the games of the benchmark list:
for i in range(len(pmlib.benchmark_games)):
    print()
    print()
    print("*****", pmlib.benchmark_names[i], "*****")
    game = pmlib.benchmark_games[i]
    game.dump(plot=False, nice=True)  # set plot=True to plot the outcome distributions
    # compute the game's place in the PM hierarchy
    hierarchy, why = pmlib.ProblemClass(game)
    print()
    print()
    print("======> This game is", hierarchy.upper() + ", because", why)
# In[9]:
# The present version only includes 4 variants of the FeedExp3 algorithm.
# The BasicFeedexp3 class only works with some forms of numeric feedback matrices where
# there exists an NxN matrix K such that L=KF (see Piccolboni & Schindelhauer 2000).
# The GeneralFeedexp3 class uses cell decomposition to handle more general forms of feedback.
# The eta and gamma parameters can be optimized for a known horizon.
# When these parameters are set to zero we use an anytime version with dynamic eta and gamma parameters
# as specified in (Cesa-Bianchi et al. 2006).
from multiprocessing import cpu_count
nbCores = max(1, cpu_count() - 2)
nbReps = nbCores * 10
horizon = 10000
pm_game = pmlib.AppleTasting([0.05,0.95])
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [10., 7.]
eta, gamma = pmlib.optimal_Feedexp3_parameters(pm_game, horizon)
gfx3 = pmlib.GeneralFeedexp3(pm_game, eta, gamma)
cumRegrets1 = pmlib.eval_policy_parallel(nbCores, nbReps, horizon, pm_game, gfx3)
pmlib.init_plot("Average regret curve")
pmlib.plot_regret(cumRegrets1, mylabel="General FeedExp3 (known horizon)", mycolor='green')
pmlib.show_plot()
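# In[ ]:
# A minimal sketch of the anytime variant: as stated above, passing eta = gamma = 0 is
# supposed to select the dynamic parameter schedule of (Cesa-Bianchi et al. 2006),
# so we can compare it against the known-horizon run computed just before.
gfx3_anytime = pmlib.GeneralFeedexp3(pm_game, 0., 0.)
cumRegretsAnytime = pmlib.eval_policy_parallel(nbCores, nbReps, horizon, pm_game, gfx3_anytime)
pmlib.init_plot("Known horizon versus anytime FeedExp3")
pmlib.plot_regret(cumRegrets1, mylabel="General FeedExp3 (known horizon)", mycolor='green')
pmlib.plot_regret(cumRegretsAnytime, mylabel="General FeedExp3 (anytime)", mycolor='blue')
pmlib.show_plot()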
# In[10]:
# Here is a generic plotting function (it uses the global nbCores, nbReps and horizon defined above)
def plot_game(pm_game):
    # FeedExp3
    print("** FeedExp3")
    eta, gamma = pmlib.optimal_Feedexp3_parameters(pm_game, horizon)
    gfx3 = pmlib.GeneralFeedexp3(pm_game, eta, gamma)
    cumRegrets1 = pmlib.eval_policy_parallel(nbCores, nbReps, horizon, pm_game, gfx3)
    # Rex3 (only defined for dueling bandit games)
    if pm_game.game_type == "dueling":
        print("** Rex3")
        rex3 = pmlib.Rex3(pm_game, pmlib.optimal_gamma(pm_game, horizon))
        cumRegrets2 = pmlib.eval_policy_parallel(nbCores, nbReps, horizon, pm_game, rex3)
    # Random baseline
    print("** Random")
    baseline = pmlib.BasicPolicy(pm_game)
    cumRegrets3 = pmlib.eval_policy_parallel(nbCores, nbReps, horizon, pm_game, baseline)
    pmlib.init_plot("Generic PM versus ad hoc DB")
    pmlib.plot_regret(cumRegrets1, mylabel="General FeedExp3 (known horizon)", mycolor='black')
    if pm_game.game_type == "dueling":
        # It is an open question whether a general PM algorithm can be as tight as an
        # ad hoc dueling bandits algorithm.
        pmlib.plot_regret(cumRegrets2, mylabel="Rex3 (known horizon)", mycolor='orange')
    pmlib.plot_regret(cumRegrets3, mylabel="Random", mycolor='red', autoscale=False)
    pmlib.show_plot()
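# Usage example: the helper can also be run on a single instance, e.g. the dueling bandit
# created earlier (assuming BinaryUtilityDuelingBanditPM reports game_type == "dueling",
# this also triggers the Rex3 comparison):
# plot_game(dueling)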
# In[11]:
# We can also plot all these games (change horizon and nbReps at will)
horizon = 2000
nbReps = nbCores * 10
for i in range(len(pmlib.benchmark_games)):
    print()
    print()
    print("*****", pmlib.benchmark_names[i], "*****")
    game = pmlib.benchmark_games[i]
    print()
    plot_game(game)
# In[ ]: