PlaytikaOSS
diff --git a/‎pybandits/base.py
Lines changed: 49 additions & 1 deletion b/‎pybandits/base.py
Lines changed: 49 additions & 1 deletion
diff --git a/‎pybandits/cmab.py
Lines changed: 53 additions & 11 deletions b/‎pybandits/cmab.py
Lines changed: 53 additions & 11 deletions
@@ -22,8 +22,9 @@
 
 
 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, NewType, Optional, Set, Tuple, Union, Callable
 
+import numpy as np
 from pydantic import (
     BaseModel,
     Extra,
@@ -33,8 +34,10 @@
     constr,
     validate_arguments,
     validator,
+    root_validator,
 )
 
+
 ActionId = NewType("ActionId", constr(min_length=1))
 Float01 = NewType("Float_0_1", confloat(ge=0, le=1))
 Probability = NewType("Probability", Float01)
@@ -74,6 +77,47 @@ class Strategy(PyBanditsBaseModel, ABC):
     Strategy to select actions in multi-armed bandits.
     """
 
+    epsilon: Optional[Float01]
+    default_action: Optional[ActionId]
+
+    def __init__(self, **data: Any) -> None:
+        """
+        Initialize the strategy and, when epsilon is set, swap select_action for its epsilon-greedy wrapper.
+
+        Parameters
+        ----------
+        data: Any
+            Keyword arguments forwarded unchanged to the parent initializer.
+        """
+        super().__init__(**data)
+        if self.epsilon is None:
+            # No epsilon configured: select_action is left exactly as the subclass defines it.
+            return
+        # NOTE(review): this assigns an attribute that is not a declared field; whether the model accepts
+        # that depends on the base model configuration, which is not visible here - confirm.
+        greedy_select = self._make_epsilon_greedy(self.epsilon, self.default_action, self.select_action)
+        self.select_action = greedy_select
+
+    @staticmethod
+    @validate_arguments
+    def _make_epsilon_greedy(
+        epsilon: Float01, default_action: Optional[ActionId], select_action: Callable
+    ) -> Callable:
+        """
+        Wraps a select_action function with epsilon-greedy strategy.
+
+        Parameters
+        ----------
+        epsilon: Float_0_1
+            Number in [0, 1] which specifies the probability of selecting a default_action.
+        default_action: Optional[ActionId]
+            The default action to be selected with probability epsilon. None for random action.
+        select_action: Callable
+            The function to be wrapped. It is called with the keyword arguments ``p`` and ``actions``.
+
+        Returns
+        -------
+        select_epsilon_greedy_action: Callable
+            The wrapped function.
+        """
+
+        def select_epsilon_greedy_action(
+            p: Dict[ActionId, float],
+            actions: Optional[Dict[ActionId, Model]] = None,
+        ) -> ActionId:
+            """
+            Select default_action (or a random key of p) with probability epsilon, else defer to select_action.
+
+            Raises
+            ------
+            KeyError
+                If default_action is set but is not a key of p.
+            """
+            # Checked on every call, before the random draw: the candidate actions are only known once
+            # p is supplied, and a misconfigured default must fail even on non-exploring calls.
+            if default_action is not None and default_action not in p:
+                raise KeyError(f"Default action {default_action} not in actions.")
+            # A single Bernoulli(epsilon) draw decides between the epsilon branch and the wrapped strategy.
+            if np.random.binomial(1, epsilon):
+                if default_action is not None:
+                    return default_action
+                # Draw a position instead of an element so the original key object is returned,
+                # not a numpy scalar copy of it.
+                candidates = list(p)
+                return candidates[np.random.choice(len(candidates))]
+            return select_action(p=p, actions=actions)
+
+        return select_epsilon_greedy_action
+
     @abstractmethod
     def select_action(self, p: Dict[ActionId, Probability], actions: Optional[Dict[ActionId, Model]]) -> ActionId:
         """
@@ -91,10 +135,14 @@ class BaseMab(PyBanditsBaseModel, ABC):
         The list of possible actions, and their associated Model.
     strategy: Strategy
         The strategy used to select actions.
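+    epsilon: Optional[Float01], defaults to None
+        epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+    default_action: Optional[ActionId]
+        Default action to select if the epsilon-greedy approach is used. None for random selection.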
     """
 
     actions: Dict[ActionId, Model]
     strategy: Strategy
+    epsilon: Optional[Float01] = None
+    default_action: Optional[ActionId]
 
     @validator("actions", pre=True)
     @classmethod
 
@@ -212,8 +212,13 @@ class CmabBernoulli(BaseCmabBernoulli):
     predict_with_proba: bool = False
     predict_actions_randomly: bool = False
 
-    def __init__(self, actions: Dict[ActionId, BaseBayesianLogisticRegression]):
-        super().__init__(actions=actions, strategy=ClassicBandit())
+    def __init__(
+        self,
+        actions: Dict[ActionId, BaseBayesianLogisticRegression],
+        epsilon: Optional[Float01] = None,
+        default_action: Optional[ActionId] = None,
+    ):
+        """
+        Create the bandit with a ClassicBandit strategy.
+
+        Parameters
+        ----------
+        actions: Dict[ActionId, BaseBayesianLogisticRegression]
+            The possible actions and the model associated with each of them.
+        epsilon: Optional[Float01]
+            epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+        default_action: Optional[ActionId]
+            Default action to select if the epsilon-greedy approach is used. None for random selection.
+        """
+        # epsilon and default_action are handed to the parent initializer, not to the strategy instance.
+        super().__init__(actions=actions, strategy=ClassicBandit(), epsilon=epsilon, default_action=default_action)
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulli":
@@ -249,9 +254,15 @@ class CmabBernoulliBAI(BaseCmabBernoulli):
     predict_with_proba: bool = False
     predict_actions_randomly: bool = False
 
-    def __init__(self, actions: Dict[ActionId, BayesianLogisticRegression], exploit_p: Optional[Float01] = None):
+    def __init__(
+        self,
+        actions: Dict[ActionId, BayesianLogisticRegression],
+        epsilon: Optional[Float01] = None,
+        default_action: Optional[ActionId] = None,
+        exploit_p: Optional[Float01] = None,
+    ):
+        """
+        Create the bandit with a BestActionIdentification strategy.
+
+        Parameters
+        ----------
+        actions: Dict[ActionId, BayesianLogisticRegression]
+            The possible actions and the model associated with each of them.
+        epsilon: Optional[Float01]
+            epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+        default_action: Optional[ActionId]
+            Default action to select if the epsilon-greedy approach is used. None for random selection.
+        exploit_p: Optional[Float01]
+            Forwarded to BestActionIdentification when not None; otherwise the strategy is built with its
+            own default.
+        """
+        # NOTE(review): epsilon and default_action are placed ahead of exploit_p, so a positional call
+        # written for the previous signature (actions, exploit_p) would now bind that value to epsilon.
+        # The cold-start factory lists exploit_p first - confirm that all callers pass keywords.
         strategy = BestActionIdentification() if exploit_p is None else BestActionIdentification(exploit_p=exploit_p)
-        super().__init__(actions=actions, strategy=strategy)
+        super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action)
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulliBAI":
@@ -296,9 +307,15 @@ class CmabBernoulliCC(BaseCmabBernoulli):
     predict_with_proba: bool = True
     predict_actions_randomly: bool = False
 
-    def __init__(self, actions: Dict[ActionId, BayesianLogisticRegressionCC], subsidy_factor: Optional[Float01] = None):
+    def __init__(
+        self,
+        actions: Dict[ActionId, BayesianLogisticRegressionCC],
+        epsilon: Optional[Float01] = None,
+        default_action: Optional[ActionId] = None,
+        subsidy_factor: Optional[Float01] = None,
+    ):
+        """
+        Create the bandit with a CostControlBandit strategy.
+
+        Parameters
+        ----------
+        actions: Dict[ActionId, BayesianLogisticRegressionCC]
+            The possible actions and the model associated with each of them.
+        epsilon: Optional[Float01]
+            epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+        default_action: Optional[ActionId]
+            Default action to select if the epsilon-greedy approach is used. None for random selection.
+        subsidy_factor: Optional[Float01]
+            Forwarded to CostControlBandit when not None; otherwise the strategy is built with its own
+            default.
+        """
+        # NOTE(review): epsilon and default_action are placed ahead of subsidy_factor, so a positional call
+        # written for the previous signature (actions, subsidy_factor) would now bind that value to epsilon.
+        # The cold-start factory lists subsidy_factor first - confirm that all callers pass keywords.
         strategy = CostControlBandit() if subsidy_factor is None else CostControlBandit(subsidy_factor=subsidy_factor)
-        super().__init__(actions=actions, strategy=strategy)
+        super().__init__(actions=actions, strategy=strategy, epsilon=epsilon, default_action=default_action)
 
     @classmethod
     def from_state(cls, state: dict) -> "CmabBernoulliCC":
@@ -310,7 +327,12 @@ def update(self, context: ArrayLike, actions: List[ActionId], rewards: List[Bina
 
 
 @validate_arguments
-def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: PositiveInt) -> CmabBernoulli:
+def create_cmab_bernoulli_cold_start(
+    action_ids: Set[ActionId],
+    n_features: PositiveInt,
+    epsilon: Optional[Float01] = None,
+    default_action: Optional[ActionId] = None,
+) -> CmabBernoulli:
     """
     Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, with default
     parameters. Until the very first update the model will predict actions randomly, where each action has equal
@@ -323,6 +345,10 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi
     n_features: PositiveInt
         The number of features expected after in the context matrix. This is also the number of betas of the
         Bayesian Logistic Regression model.
+    epsilon: Optional[Float01]
+        epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+    default_action: Optional[ActionId]
+        Default action to select if the epsilon-greedy approach is used. None for random selection.
+
     Returns
     -------
     cmab: CmabBernoulli
@@ -331,14 +357,18 @@ def create_cmab_bernoulli_cold_start(action_ids: Set[ActionId], n_features: Posi
     actions = {}
     for a in set(action_ids):
         actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features)
-    mab = CmabBernoulli(actions=actions)
+    mab = CmabBernoulli(actions=actions, epsilon=epsilon, default_action=default_action)
     mab.predict_actions_randomly = True
     return mab
 
 
 @validate_arguments
 def create_cmab_bernoulli_bai_cold_start(
-    action_ids: Set[ActionId], n_features: PositiveInt, exploit_p: Optional[Float01] = None
+    action_ids: Set[ActionId],
+    n_features: PositiveInt,
+    exploit_p: Optional[Float01] = None,
+    epsilon: Optional[Float01] = None,
+    default_action: Optional[ActionId] = None,
 ) -> CmabBernoulliBAI:
     """
     Utility function to create a Contextual Bernoulli Multi-Armed Bandit with Thompson Sampling, and Best Action
@@ -361,6 +391,10 @@ def create_cmab_bernoulli_bai_cold_start(
             (it behaves as a Greedy strategy).
         If exploit_p is 0, the bandits always select the action with 2nd highest probability of getting a positive
             reward.
+    epsilon: Optional[Float01]
+        epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+    default_action: Optional[ActionId]
+        Default action to select if the epsilon-greedy approach is used. None for random selection.
 
     Returns
     -------
@@ -370,7 +404,7 @@ def create_cmab_bernoulli_bai_cold_start(
     actions = {}
     for a in set(action_ids):
         actions[a] = create_bayesian_logistic_regression_cold_start(n_betas=n_features)
-    mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p)
+    mab = CmabBernoulliBAI(actions=actions, exploit_p=exploit_p, epsilon=epsilon, default_action=default_action)
     mab.predict_actions_randomly = True
     return mab
 
@@ -380,6 +414,8 @@ def create_cmab_bernoulli_cc_cold_start(
     action_ids_cost: Dict[ActionId, NonNegativeFloat],
     n_features: PositiveInt,
     subsidy_factor: Optional[Float01] = None,
+    epsilon: Optional[Float01] = None,
+    default_action: Optional[ActionId] = None,
 ) -> CmabBernoulliCC:
     """
     Utility function to create a Stochastic Bernoulli Multi-Armed Bandit with Thompson Sampling, and Cost Control
@@ -408,6 +444,10 @@ def create_cmab_bernoulli_cc_cold_start(
         If subsidy_factor is 1, the bandits always selects the action with the minimum cost.
         If subsidy_factor is 0, the bandits always selects the action with highest probability of getting a positive
             reward (it behaves as a classic Bernoulli bandit).
+    epsilon: Optional[Float01]
+        epsilon for epsilon-greedy approach. If None, epsilon-greedy is not used.
+    default_action: Optional[ActionId]
+        Default action to select if the epsilon-greedy approach is used. None for random selection.
 
     Returns
     -------
@@ -417,6 +457,8 @@ def create_cmab_bernoulli_cc_cold_start(
     actions = {}
     for a, cost in action_ids_cost.items():
         actions[a] = create_bayesian_logistic_regression_cc_cold_start(n_betas=n_features, cost=cost)
-    mab = CmabBernoulliCC(actions=actions, subsidy_factor=subsidy_factor)
+    mab = CmabBernoulliCC(
+        actions=actions, subsidy_factor=subsidy_factor, epsilon=epsilon, default_action=default_action
+    )
     mab.predict_actions_randomly = True
     return mab