|
21 | 21 | # SOFTWARE.
|
22 | 22 |
|
23 | 23 |
|
24 |
| -from abc import ABC, abstractmethod |
25 |
| -from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union |
| 24 | +from typing import Dict, List, NewType, Tuple, Union |
26 | 25 |
|
27 |
| -import numpy as np |
28 |
| -from pydantic import ( |
29 |
| - BaseModel, |
30 |
| - NonNegativeInt, |
31 |
| - confloat, |
32 |
| - conint, |
33 |
| - constr, |
34 |
| - field_validator, |
35 |
| - model_validator, |
36 |
| - validate_call, |
37 |
| -) |
| 26 | +from pydantic import BaseModel, confloat, conint, constr |
38 | 27 |
|
# ---------------------------------------------------------------------------
# Shared type aliases for the PyBandits API.
# NOTE(review): NewType is applied to pydantic constrained types and Unions,
# which are not proper classes — static checkers may not fully honor these
# aliases; at runtime NewType performs no validation.
# ---------------------------------------------------------------------------

# Non-empty string identifying an action.
ActionId = NewType("ActionId", constr(min_length=1))
# Float constrained to the closed interval [0, 1].
Float01 = NewType("Float_0_1", confloat(ge=0, le=1))
# A probability value; semantically identical to Float01.
Probability = NewType("Probability", Float01)
# sMAB predict output: (selected action per sample, per-sample action probabilities).
SmabPredictions = NewType("SmabPredictions", Tuple[List[ActionId], List[Dict[ActionId, Probability]]])
# cMAB predict output: as sMAB, plus a per-sample dict of float scores per action.
# NOTE(review): the semantics of the third element (per-action floats) are not
# visible in this chunk — confirm against the cMAB implementation.
CmabPredictions = NewType(
    "CmabPredictions", Tuple[List[ActionId], List[Dict[ActionId, Probability]], List[Dict[ActionId, float]]]
)
# Either flavor of prediction output.
Predictions = NewType("Predictions", Union[SmabPredictions, CmabPredictions])
# Reward restricted to {0, 1}.
BinaryReward = NewType("BinaryReward", conint(ge=0, le=1))
# Per-action reward likelihood: a scalar, a probability, or a list of
# probabilities (one per objective in multi-objective settings).
ActionRewardLikelihood = NewType(
    "ActionRewardLikelihood",
    Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]],
)
# Prefix used to namespace per-action keyword arguments.
# NOTE(review): the consumer of this prefix is not visible in this chunk — verify.
ACTION_IDS_PREFIX = "action_ids_"
44 | 42 |
|
45 | 43 |
|
class PyBanditsBaseModel(BaseModel, extra="forbid"):
    """
    BaseModel of the PyBandits library.

    All PyBandits models inherit from this class; ``extra="forbid"`` makes
    pydantic reject any unknown field at construction time instead of
    silently ignoring it.
    """
|
50 |
| - |
51 |
| - |
52 |
class Model(PyBanditsBaseModel, ABC):
    """
    Class to model the prior distributions.

    Concrete subclasses hold the parameters of an action's reward model and
    implement probability sampling and updates from observed rewards.
    """

    @abstractmethod
    def sample_proba(self) -> Probability:
        """
        Sample the probability of getting a positive reward.
        """

    @abstractmethod
    def update(self, rewards: List[Any]):
        """
        Update the model parameters.

        Parameters
        ----------
        rewards : List[Any]
            Observed rewards used to update the model; the element type is
            defined by the concrete subclass.
        """
68 |
| - |
69 |
| - |
70 |
class Strategy(PyBanditsBaseModel, ABC):
    """
    Strategy to select actions in multi-armed bandits.
    """

    @abstractmethod
    def select_action(self, p: Dict[ActionId, Probability], actions: Optional[Dict[ActionId, Model]]) -> ActionId:
        """
        Select the action.

        Parameters
        ----------
        p : Dict[ActionId, Probability]
            Sampled probability of a positive reward for each action.
        actions : Optional[Dict[ActionId, Model]]
            The actions and their associated models, for strategies that need
            model state beyond the sampled probabilities.

        Returns
        -------
        ActionId
            The selected action.
        """
80 |
| - |
81 |
| - |
82 |
class BaseMab(PyBanditsBaseModel, ABC):
    """
    Multi-armed bandit superclass.

    Parameters
    ----------
    actions : Dict[ActionId, Model]
        The list of possible actions, and their associated Model.
    strategy : Strategy
        The strategy used to select actions.
    epsilon : Optional[Float01]
        The probability of selecting a random action.
    default_action : Optional[ActionId]
        The default action to select with a probability of epsilon when using the
        epsilon-greedy approach. If `default_action` is None, a random action from
        the action set will be selected with a probability of epsilon.
    """

    actions: Dict[ActionId, Model]
    strategy: Strategy
    epsilon: Optional[Float01]
    default_action: Optional[ActionId]

    @field_validator("actions", mode="before")
    @classmethod
    def at_least_2_actions_are_defined(cls, v):
        """
        Validate that at least 2 actions are defined and that all actions are
        instances of the same model type.
        """
        # validate that at least 2 actions are defined
        if len(v) < 2:
            raise AttributeError("At least 2 actions should be defined.")
        # validate that all actions are of the same configuration
        action_models = list(v.values())
        first_action = action_models[0]
        first_action_type = type(first_action)
        if any(not isinstance(action, first_action_type) for action in action_models[1:]):
            raise AttributeError("All actions should follow the same type.")

        return v

    @model_validator(mode="after")
    def check_default_action(self):
        """
        Ensure `default_action` is only set together with `epsilon` and refers
        to one of the defined actions.
        """
        # NOTE: epsilon=0 is falsy and treated the same as epsilon=None here.
        if not self.epsilon and self.default_action:
            raise AttributeError("A default action should only be defined when epsilon is defined.")
        if self.default_action and self.default_action not in self.actions:
            raise AttributeError("The default action should be defined in the actions.")
        return self

    def _get_valid_actions(self, forbidden_actions: Optional[Set[ActionId]]) -> Set[ActionId]:
        """
        Given a set of forbidden action IDs, return a set of valid action IDs.

        Parameters
        ----------
        forbidden_actions: Optional[Set[ActionId]]
            The set of forbidden action IDs.

        Returns
        -------
        valid_actions: Set[ActionId]
            The list of valid (i.e. not forbidden) action IDs.

        Raises
        ------
        ValueError
            If forbidden_actions contains unknown IDs, forbids every action,
            or forbids the default action.
        """
        if forbidden_actions is None:
            forbidden_actions = set()

        if not all(a in self.actions.keys() for a in forbidden_actions):
            raise ValueError("forbidden_actions contains invalid action IDs.")
        valid_actions = set(self.actions.keys()) - forbidden_actions
        if len(valid_actions) == 0:
            raise ValueError("All actions are forbidden. You must allow at least 1 action.")
        if self.default_action and self.default_action not in valid_actions:
            raise ValueError("The default action is forbidden.")

        return valid_actions

    def _check_update_params(self, actions: List[ActionId], rewards: List[Union[NonNegativeInt, List[NonNegativeInt]]]):
        """
        Verify that the given list of action IDs is a subset of the currently
        defined actions, and that actions and rewards are aligned.

        Parameters
        ----------
        actions : List[ActionId]
            The selected action for each sample.
        rewards: List[Union[BinaryReward, List[BinaryReward]]]
            The reward for each sample.

        Raises
        ------
        AttributeError
            If an unknown action ID is supplied, or the two lists differ in length.
        """
        invalid = set(actions) - set(self.actions.keys())
        if invalid:
            raise AttributeError(f"The following invalid action(s) were specified: {invalid}.")
        if len(actions) != len(rewards):
            raise AttributeError(f"Shape mismatch: actions and rewards should have the same length {len(actions)}.")

    @abstractmethod
    @validate_call
    def update(self, actions: List[ActionId], rewards: List[Union[BinaryReward, List[BinaryReward]]], *args, **kwargs):
        """
        Update the stochastic multi-armed bandit model.

        Parameters
        ----------
        actions: List[ActionId]
            The selected action for each sample.
        rewards: List[Union[BinaryReward, List[BinaryReward]]]
            The reward for each sample.
        """

    @abstractmethod
    @validate_call
    def predict(self, forbidden_actions: Optional[Set[ActionId]] = None):
        """
        Predict actions.

        Parameters
        ----------
        forbidden_actions : Optional[Set[ActionId]], default=None
            Set of forbidden actions. If specified, the model will discard the forbidden_actions and it will only
            consider the remaining allowed_actions. By default, the model considers all actions as allowed_actions.
            Note that: actions = allowed_actions U forbidden_actions.

        Returns
        -------
        actions: List[ActionId] of shape (n_samples,)
            The actions selected by the multi-armed bandit model.
        probs: List[Dict[ActionId, float]] of shape (n_samples,)
            The probabilities of getting a positive reward for each action.
        """

    def get_state(self) -> Tuple[str, dict]:
        """
        Access the complete model internal state, enough to create an exact copy of the same model from it.

        Returns
        -------
        model_class_name: str
            The name of the class of the model.
        model_state: dict
            The internal state of the model (actions, scores, etc.).
        """
        model_name = self.__class__.__name__
        # model_dump() replaces the deprecated BaseModel.dict() of pydantic v1;
        # this module already uses the v2 validator APIs (field_validator, etc.).
        state: dict = self.model_dump()
        return model_name, state

    @validate_call
    def _select_epsilon_greedy_action(
        self,
        p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]],
        actions: Optional[Dict[ActionId, Model]] = None,
    ) -> ActionId:
        """
        Wraps self.strategy.select_action function with epsilon-greedy strategy,
        such that with probability epsilon a default_action is selected,
        and with probability 1-epsilon the select_action function is triggered to choose action.
        If no default_action is provided, a random action is selected.

        Reference: Reinforcement Learning: An Introduction, Ch. 2 (Sutton and Barto, 2018)
        https://web.stanford.edu/class/psych209/Readings/SuttonBartoIPRLBook2ndEd.pdf

        Parameters
        ----------
        p: Union[Dict[ActionId, float], Dict[ActionId, Probability], Dict[ActionId, List[Probability]]]
            The dictionary of actions and their sampled probability of getting a positive reward.
            For MO strategy, the sampled probability is a list with elements corresponding to the objectives.
        actions: Optional[Dict[ActionId, Model]]
            The dictionary of actions and their associated Model.

        Returns
        -------
        selected_action: ActionId
            The selected action.

        Raises
        ------
        KeyError
            If self.default_action is not present as a key in the probabilities dictionary.
        """

        if self.epsilon:
            if self.default_action and self.default_action not in p.keys():
                raise KeyError(f"Default action {self.default_action} not in actions.")
            # With probability epsilon: explore (default or uniformly random action);
            # otherwise: exploit via the configured strategy.
            if np.random.binomial(1, self.epsilon):
                selected_action = self.default_action if self.default_action else np.random.choice(list(p.keys()))
            else:
                selected_action = self.strategy.select_action(p=p, actions=actions)
        else:
            selected_action = self.strategy.select_action(p=p, actions=actions)
        return selected_action
0 commit comments