feat: Stable pibo implementation

automl · Aug 30, 2024 · 5f76a7e · 5f76a7e
1 parent da4f376
commit 5f76a7e
Show file tree

Hide file tree

Showing 13 changed files with 210 additions and 102 deletions.
diff --git a/neps/optimizers/__init__.py b/neps/optimizers/__init__.py
@@ -19,7 +19,6 @@
     SuccessiveHalving,
     SuccessiveHalvingWithPriors,
 )
-from .multi_fidelity_prior.async_priorband import PriorBandAsha, PriorBandAshaHB
 from .multi_fidelity_prior.priorband import PriorBand
 from .random_search.optimizer import RandomSearch
 from .regularized_evolution.optimizer import RegularizedEvolution
@@ -29,8 +28,8 @@
 
 # TODO: Rename Searcher to Optimizer...
 SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = {
-    "bayesian_optimization": BayesianOptimization,
-    "pibo": partial(BayesianOptimization, disable_priors=False),
+    "bayesian_optimization": partial(BayesianOptimization, use_priors=False),
+    "pibo": partial(BayesianOptimization, use_priors=True),
     "random_search": RandomSearch,
     "regularized_evolution": RegularizedEvolution,
     "assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True),

diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py b/neps/optimizers/bayesian_optimization/acquisition_functions/pibo.py
@@ -39,9 +39,15 @@ def apply_pibo_acquisition_weight(
     x_domain: Domain | list[Domain],
     prior_exponent: float,
 ):
+    import rich
+
+    rich.print(prior_exponent)
     if acq._log:
-        return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent
-    return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent)
+        weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent
+        return acq_values + weighted_log_probs
+
+    weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent)
+    return acq_values * weighted_probs
 
 
 def pibo_acquisition(

diff --git a/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py b/neps/optimizers/bayesian_optimization/acquisition_functions/weighted_acquisition.py
@@ -115,7 +115,7 @@ def __init__(
         # NOTE: We remove the X_pending from the base acquisition function as we will get
         # it in our own forward with `@concatenate_pending_points` and pass that forward.
         # This avoids possible duplicates
-        self.acq.set_X_pending(None)
+        acq.set_X_pending(None)
         self.set_X_pending(X_pending)
         self.apply_weight = apply_weight
         self.acq = acq
@@ -136,10 +136,11 @@ def forward(self, X: Tensor) -> Tensor:
         """
         if isinstance(self.acq, SampleReducingMCAcquisitionFunction):
             # shape: mc_samples x batch x q-candidates
-            acq_values = self.acq._non_reduce_forward(X)
+            acq_values = self.acq._non_reduced_forward(X)
             weighted_acq_values = self.apply_weight(acq_values, X, self.acq)
-            vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values))
-            return vals.squeeze(-1)
+            q_reduced_acq = self.acq._q_reduction(weighted_acq_values)
+            sample_reduced_acq = self.acq._sample_reduction(q_reduced_acq)
+            return sample_reduced_acq.squeeze(-1)
 
         # shape: batch x q-candidates
         acq_values = self.acq(X).unsqueeze(-1)

diff --git a/neps/optimizers/bayesian_optimization/models/gp.py b/neps/optimizers/bayesian_optimization/models/gp.py
@@ -5,12 +5,14 @@
 from functools import reduce
 from typing import TYPE_CHECKING, Any, Mapping, TypeVar
 
-from botorch.models import MultiTaskGP
 import gpytorch
 import gpytorch.constraints
 import torch
 from botorch.acquisition.analytic import SingleTaskGP
-from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP
+from botorch.models.gp_regression_mixed import (
+    CategoricalKernel,
+    Likelihood,
+)
 from botorch.models.transforms.outcome import Standardize
 from botorch.optim import optimize_acqf, optimize_acqf_mixed
 from gpytorch.kernels import MaternKernel, ScaleKernel

diff --git a/neps/optimizers/bayesian_optimization/optimizer.py b/neps/optimizers/bayesian_optimization/optimizer.py
@@ -4,10 +4,7 @@
 from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping
 
 import torch
-from botorch.acquisition import (
-    LinearMCObjective,
-    qLogExpectedImprovement,
-)
+from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
 from botorch.fit import fit_gpytorch_mll
 from gpytorch import ExactMarginalLogLikelihood
 
@@ -95,25 +92,34 @@ def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor:
     return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True)
 
 
-def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float:
-    if budget_info.max_evaluations is not None:
-        # From the PIBO paper (Section 4.1)
-        # https://arxiv.org/pdf/2204.11051
-        n = n_sampled_already
-        beta = budget_info.max_evaluations / 10
-    elif budget_info.max_cost_budget is not None:
-        # This might not work well if cost number is high
-        # early on, but it will start to normalize.
-        n = budget_info.used_cost_budget
-        beta = budget_info.max_cost_budget / 10
-    else:
-        # Otherwise, just some random heuristic based on the number
-        # of trials and dimensionality of the search space
-        # TODO: Think about and evaluate this more.
-        n = n_sampled_already
-        beta = ndims**2 / 10
-
-    return beta / n
+def _pibo_exp_term(
+    n_sampled_already: int,
+    ndims: int,
+    initial_design_size: int,
+) -> float:
+    # pibo paper
+    # https://arxiv.org/pdf/2204.11051
+    #
+    # they use some constant determined from max problem budget. seems impractical,
+    # given we might not know the final budget (i.e. imagine you iteratively increase
+    # the budget as you go along).
+    #
+    # instead, we base it on the fact that in lower dimensions, we don't to rely
+    # on the prior for too long as the amount of space you need to cover around the
+    # prior is fairly low. effectively, since the gp needs little samples to
+    # model pretty effectively in low dimension, we can derive the utility from
+    # the prior pretty quickly.
+    #
+    # however, for high dimensional settings, we want to rely longer on the prior
+    # for longer as the number of samples needed to model the area around the prior
+    # is much larger, and deriving the utility will take longer.
+    #
+    # in the end, we would like some curve going from 1->0 as n->inf, where `n` is
+    # the number of samples we have done so far.
+    # the easiest function that does this is `exp(-n)`, with some discounting of `n`
+    # dependant on the number of dimensions.
+    n_bo_samples = n_sampled_already - initial_design_size
+    return math.exp(-n_bo_samples / ndims)
 
 
 def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float:
@@ -214,19 +220,21 @@ def __init__(  # noqa: D417
             raise NotImplementedError("Only supports flat search spaces for now!")
         super().__init__(pipeline_space=pipeline_space)
 
-        if initial_design_size is None:
-            N = len(pipeline_space.hyperparameters)
-            initial_design_size = int(max(2, math.log(N) ** 2))
-        elif initial_design_size < 1:
-            raise ValueError("Initial_design_size to be at least 1")
-
         params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = {
             **pipeline_space.numerical,
             **pipeline_space.categoricals,
         }
         if treat_fidelity_as_hyperparameters:
             params.update(pipeline_space.fidelities)
 
+        if initial_design_size is None:
+            # As we have fairly regularized GPs, who start with a more smooth landscape
+            # model, we don't need a high level of initial samples.
+            ndims = len(params)
+            initial_design_size = max(2, int(math.log(ndims) ** 2))
+        elif initial_design_size < 1:
+            raise ValueError("Initial_design_size to be at least 1")
+
         self.encoder = TensorEncoder.default(params) if encoder is None else encoder
         self.use_cost = use_cost
         self.prior = _make_prior(params) if use_priors is True else None
@@ -251,9 +259,9 @@ def ask(
                 "Seed is not yet implemented for BayesianOptimization"
             )
 
-        n_trials = len(trials)
+        n_trials_completed = len(trials)
         space = self.pipeline_space
-        config_id = str(n_trials + 1)
+        config_id = str(n_trials_completed + 1)
 
         # Fill intitial design data if we don't have any...
         if self.initial_design_ is None:
@@ -279,8 +287,8 @@ def ask(
             self.initial_design_.extend(configs)
 
         # If we havn't passed the intial design phase
-        if n_trials <= len(self.initial_design_):
-            config = self.initial_design_[n_trials - 1]
+        if n_trials_completed < len(self.initial_design_):
+            config = self.initial_design_[n_trials_completed]
             sample = SampledConfig(id=config_id, config=config, previous_config_id=None)
             return sample, optimizer_state
 
@@ -331,14 +339,25 @@ def ask(
         # If we should use the prior, weight the acquisition function by
         # the probability of it being sampled from the prior.
         if self.prior:
-            acq = pibo_acquisition(
-                acq,
-                prior=self.prior,
-                prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info),
-                x_domain=self.encoder.domains,
-                X_pending=maybe_x_pending_tensor,
+            pibo_exp_term = _pibo_exp_term(
+                n_trials_completed,
+                self.encoder.ncols,
+                self.n_initial_design,
             )
 
+            # If the amount of weight derived from the pibo exponent becomes
+            # insignificant, we don't use it as it as it adds extra computational
+            # burden and introduces more chance of numerical instability.
+            significant_lower_bound = 1e-4
+            if pibo_exp_term > significant_lower_bound:
+                acq = pibo_acquisition(
+                    acq,
+                    prior=self.prior,
+                    prior_exponent=pibo_exp_term,
+                    x_domain=self.encoder.domains,
+                    X_pending=maybe_x_pending_tensor,
+                )
+
         # If we should use cost, weight the acquisition function by the cost
         # of the configurations.
         if self.use_cost:

diff --git a/neps/optimizers/default_searchers/bayesian_optimization.yaml b/neps/optimizers/default_searchers/bayesian_optimization.yaml
@@ -1,6 +1,5 @@
 strategy: bayesian_optimization
 # Arguments that can be modified by the user
-initial_design_size: 10
 surrogate_model: gp  # or {"gp_hierarchy"}
 acquisition: EI  # or {"LogEI", "AEI"}
 log_prior_weighted: false

diff --git a/neps/optimizers/default_searchers/pibo.yaml b/neps/optimizers/default_searchers/pibo.yaml
@@ -1,6 +1,5 @@
 strategy: pibo
 # Arguments that can be modified by the user
-initial_design_size: 10
 surrogate_model: gp  # or {"gp_hierarchy"}
 acquisition: EI  # or {"LogEI", "AEI"}
 log_prior_weighted: false

diff --git a/neps/runtime.py b/neps/runtime.py
@@ -519,8 +519,6 @@ def _launch_runtime(  # noqa: PLR0913
                     max_evaluations=max_evaluations_total,
                     used_evaluations=0,
                 )
-                if max_cost_total is not None
-                else None
             ),
             shared_state={},  # TODO: Unused for the time being...
         ),

diff --git a/neps/sampling/distributions.py b/neps/sampling/distributions.py
@@ -9,18 +9,22 @@
 from typing_extensions import override
 
 import torch
-from torch.distributions import Distribution, constraints
+from torch.distributions import Distribution, Uniform, constraints
 from torch.distributions.utils import broadcast_all
 
+from neps.search_spaces.domain import Domain
+
 if TYPE_CHECKING:
     from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint
-    from neps.search_spaces.domain import Domain
 
-CONST_SQRT_2 = math.sqrt(2)
-CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi)
-CONST_INV_SQRT_2 = 1 / math.sqrt(2)
-CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI)
-CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e)
+CONST_SQRT_2 = torch.tensor(math.sqrt(2), dtype=torch.float64)
+CONST_INV_SQRT_2PI = torch.tensor(1 / math.sqrt(2 * math.pi), dtype=torch.float64)
+CONST_INV_SQRT_2 = torch.tensor(1 / math.sqrt(2), dtype=torch.float64)
+CONST_LOG_INV_SQRT_2PI = torch.tensor(math.log(CONST_INV_SQRT_2PI), dtype=torch.float64)
+CONST_LOG_SQRT_2PI_E = torch.tensor(
+    0.5 * math.log(2 * math.pi * math.e),
+    dtype=torch.float64,
+)
 
 # from https://github.com/toshas/torch_truncnorm
 
@@ -224,7 +228,53 @@ def log_prob(self, value):
         return super().log_prob(value) - self._log_scale
 
 
+class UniformWithUpperBound(Uniform):
+    """Uniform distribution with upper bound inclusive.
+
+    This is mostly a hack because torch's version of Uniform does not include
+    the upper bound which only causes a problem when considering the log_prob.
+    Otherwise the upper bound works with every other method.
+    """
+
+    # OPTIM: This could probably be optimized a lot but I'm not sure how it effects
+    # gradients. Could probably do a different path depending on if `value` requires
+    # gradients or not.
+    @override
+    def log_prob(self, value: torch.Tensor) -> torch.Tensor:
+        if self._validate_args:
+            self._validate_sample(value)
+
+        lb = self.low.le(value).type_as(self.low)
+        ub = self.high.ge(value).type_as(self.low)  # The main change, is `gt` in original
+        return torch.log(lb.mul(ub)) - torch.log(self.high - self.low)
+
+
 @dataclass
 class TorchDistributionWithDomain:
     distribution: Distribution
     domain: Domain
+
+
+UNIT_UNIFORM_DIST = TorchDistributionWithDomain(
+    distribution=UniformWithUpperBound(0, 1),
+    domain=Domain.unit_float(),
+)
+
+if __name__ == "__main__":
+    loc = 0.95
+    for confidence in torch.linspace(0.0, 0.8, 8):
+        scale = 1 - confidence
+        dist = TruncatedNormal(
+            loc=loc,
+            scale=scale,
+            a=0.0,
+            b=1.0,
+        )
+        xs = torch.linspace(0, 1, 100)
+        ys = dist.log_prob(xs)
+        import matplotlib.pyplot as plt
+
+        plt.plot(xs, ys, label=f"confidence={confidence}")
+        plt.plot(loc, dist.log_prob(torch.tensor(loc)), "ro")
+    plt.legend()
+    plt.show()