Skip to content

Commit

Permalink
feat: Stable pibo implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
eddiebergman committed Aug 30, 2024
1 parent da4f376 commit 5f76a7e
Show file tree
Hide file tree
Showing 13 changed files with 210 additions and 102 deletions.
5 changes: 2 additions & 3 deletions neps/optimizers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
SuccessiveHalving,
SuccessiveHalvingWithPriors,
)
from .multi_fidelity_prior.async_priorband import PriorBandAsha, PriorBandAshaHB
from .multi_fidelity_prior.priorband import PriorBand
from .random_search.optimizer import RandomSearch
from .regularized_evolution.optimizer import RegularizedEvolution
Expand All @@ -29,8 +28,8 @@

# TODO: Rename Searcher to Optimizer...
SearcherMapping: Mapping[str, Callable[..., BaseOptimizer]] = {
"bayesian_optimization": BayesianOptimization,
"pibo": partial(BayesianOptimization, disable_priors=False),
"bayesian_optimization": partial(BayesianOptimization, use_priors=False),
"pibo": partial(BayesianOptimization, use_priors=True),
"random_search": RandomSearch,
"regularized_evolution": RegularizedEvolution,
"assisted_regularized_evolution": partial(RegularizedEvolution, assisted=True),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,15 @@ def apply_pibo_acquisition_weight(
x_domain: Domain | list[Domain],
prior_exponent: float,
):
import rich

rich.print(prior_exponent)
if acq._log:
return acq_values + prior.log_prob(X, frm=x_domain) * prior_exponent
return acq_values * prior.prob(X, frm=x_domain).pow(prior_exponent)
weighted_log_probs = prior.log_prob(X, frm=x_domain) * prior_exponent
return acq_values + weighted_log_probs

weighted_probs = prior.prob(X, frm=x_domain).pow(prior_exponent)
return acq_values * weighted_probs


def pibo_acquisition(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ def __init__(
# NOTE: We remove the X_pending from the base acquisition function as we will get
# it in our own forward with `@concatenate_pending_points` and pass that forward.
# This avoids possible duplicates
self.acq.set_X_pending(None)
acq.set_X_pending(None)
self.set_X_pending(X_pending)
self.apply_weight = apply_weight
self.acq = acq
Expand All @@ -136,10 +136,11 @@ def forward(self, X: Tensor) -> Tensor:
"""
if isinstance(self.acq, SampleReducingMCAcquisitionFunction):
# shape: mc_samples x batch x q-candidates
acq_values = self.acq._non_reduce_forward(X)
acq_values = self.acq._non_reduced_forward(X)
weighted_acq_values = self.apply_weight(acq_values, X, self.acq)
vals = self.acq._sample_reduction(self.acq._q_reduction(weighted_acq_values))
return vals.squeeze(-1)
q_reduced_acq = self.acq._q_reduction(weighted_acq_values)
sample_reduced_acq = self.acq._sample_reduction(q_reduced_acq)
return sample_reduced_acq.squeeze(-1)

# shape: batch x q-candidates
acq_values = self.acq(X).unsqueeze(-1)
Expand Down
6 changes: 4 additions & 2 deletions neps/optimizers/bayesian_optimization/models/gp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
from functools import reduce
from typing import TYPE_CHECKING, Any, Mapping, TypeVar

from botorch.models import MultiTaskGP
import gpytorch
import gpytorch.constraints
import torch
from botorch.acquisition.analytic import SingleTaskGP
from botorch.models.gp_regression_mixed import CategoricalKernel, Likelihood, MixedSingleTaskGP
from botorch.models.gp_regression_mixed import (
CategoricalKernel,
Likelihood,
)
from botorch.models.transforms.outcome import Standardize
from botorch.optim import optimize_acqf, optimize_acqf_mixed
from gpytorch.kernels import MaternKernel, ScaleKernel
Expand Down
97 changes: 58 additions & 39 deletions neps/optimizers/bayesian_optimization/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,7 @@
from typing import TYPE_CHECKING, Any, Callable, Literal, Mapping

import torch
from botorch.acquisition import (
LinearMCObjective,
qLogExpectedImprovement,
)
from botorch.acquisition import LinearMCObjective, qLogExpectedImprovement
from botorch.fit import fit_gpytorch_mll
from gpytorch import ExactMarginalLogLikelihood

Expand Down Expand Up @@ -95,25 +92,34 @@ def _missing_cost_strategy(cost: torch.Tensor) -> torch.Tensor:
return _missing_fill_strategy(cost, strategy="3std", lower_is_better=True)


def _pibo_exp_term(n_sampled_already: int, ndims: int, budget_info: BudgetInfo) -> float:
if budget_info.max_evaluations is not None:
# From the PIBO paper (Section 4.1)
# https://arxiv.org/pdf/2204.11051
n = n_sampled_already
beta = budget_info.max_evaluations / 10
elif budget_info.max_cost_budget is not None:
# This might not work well if cost number is high
# early on, but it will start to normalize.
n = budget_info.used_cost_budget
beta = budget_info.max_cost_budget / 10
else:
# Otherwise, just some random heuristic based on the number
# of trials and dimensionality of the search space
# TODO: Think about and evaluate this more.
n = n_sampled_already
beta = ndims**2 / 10

return beta / n
def _pibo_exp_term(
n_sampled_already: int,
ndims: int,
initial_design_size: int,
) -> float:
# pibo paper
# https://arxiv.org/pdf/2204.11051
#
# they use some constant determined from max problem budget. seems impractical,
# given we might not know the final budget (i.e. imagine you iteratively increase
# the budget as you go along).
#
# instead, we base it on the fact that in lower dimensions, we don't to rely
# on the prior for too long as the amount of space you need to cover around the
# prior is fairly low. effectively, since the gp needs little samples to
# model pretty effectively in low dimension, we can derive the utility from
# the prior pretty quickly.
#
# however, for high dimensional settings, we want to rely longer on the prior
# for longer as the number of samples needed to model the area around the prior
# is much larger, and deriving the utility will take longer.
#
# in the end, we would like some curve going from 1->0 as n->inf, where `n` is
# the number of samples we have done so far.
# the easiest function that does this is `exp(-n)`, with some discounting of `n`
# dependant on the number of dimensions.
n_bo_samples = n_sampled_already - initial_design_size
return math.exp(-n_bo_samples / ndims)


def _cost_used_budget_percentage(budget_info: BudgetInfo) -> float:
Expand Down Expand Up @@ -214,19 +220,21 @@ def __init__( # noqa: D417
raise NotImplementedError("Only supports flat search spaces for now!")
super().__init__(pipeline_space=pipeline_space)

if initial_design_size is None:
N = len(pipeline_space.hyperparameters)
initial_design_size = int(max(2, math.log(N) ** 2))
elif initial_design_size < 1:
raise ValueError("Initial_design_size to be at least 1")

params: dict[str, CategoricalParameter | FloatParameter | IntegerParameter] = {
**pipeline_space.numerical,
**pipeline_space.categoricals,
}
if treat_fidelity_as_hyperparameters:
params.update(pipeline_space.fidelities)

if initial_design_size is None:
# As we have fairly regularized GPs, who start with a more smooth landscape
# model, we don't need a high level of initial samples.
ndims = len(params)
initial_design_size = max(2, int(math.log(ndims) ** 2))
elif initial_design_size < 1:
raise ValueError("Initial_design_size to be at least 1")

self.encoder = TensorEncoder.default(params) if encoder is None else encoder
self.use_cost = use_cost
self.prior = _make_prior(params) if use_priors is True else None
Expand All @@ -251,9 +259,9 @@ def ask(
"Seed is not yet implemented for BayesianOptimization"
)

n_trials = len(trials)
n_trials_completed = len(trials)
space = self.pipeline_space
config_id = str(n_trials + 1)
config_id = str(n_trials_completed + 1)

# Fill intitial design data if we don't have any...
if self.initial_design_ is None:
Expand All @@ -279,8 +287,8 @@ def ask(
self.initial_design_.extend(configs)

# If we havn't passed the intial design phase
if n_trials <= len(self.initial_design_):
config = self.initial_design_[n_trials - 1]
if n_trials_completed < len(self.initial_design_):
config = self.initial_design_[n_trials_completed]
sample = SampledConfig(id=config_id, config=config, previous_config_id=None)
return sample, optimizer_state

Expand Down Expand Up @@ -331,14 +339,25 @@ def ask(
# If we should use the prior, weight the acquisition function by
# the probability of it being sampled from the prior.
if self.prior:
acq = pibo_acquisition(
acq,
prior=self.prior,
prior_exponent=_pibo_exp_term(n_trials, self.encoder.ncols, budget_info),
x_domain=self.encoder.domains,
X_pending=maybe_x_pending_tensor,
pibo_exp_term = _pibo_exp_term(
n_trials_completed,
self.encoder.ncols,
self.n_initial_design,
)

# If the amount of weight derived from the pibo exponent becomes
# insignificant, we don't use it as it as it adds extra computational
# burden and introduces more chance of numerical instability.
significant_lower_bound = 1e-4
if pibo_exp_term > significant_lower_bound:
acq = pibo_acquisition(
acq,
prior=self.prior,
prior_exponent=pibo_exp_term,
x_domain=self.encoder.domains,
X_pending=maybe_x_pending_tensor,
)

# If we should use cost, weight the acquisition function by the cost
# of the configurations.
if self.use_cost:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
strategy: bayesian_optimization
# Arguments that can be modified by the user
initial_design_size: 10
surrogate_model: gp # or {"gp_hierarchy"}
acquisition: EI # or {"LogEI", "AEI"}
log_prior_weighted: false
Expand Down
1 change: 0 additions & 1 deletion neps/optimizers/default_searchers/pibo.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
strategy: pibo
# Arguments that can be modified by the user
initial_design_size: 10
surrogate_model: gp # or {"gp_hierarchy"}
acquisition: EI # or {"LogEI", "AEI"}
log_prior_weighted: false
Expand Down
2 changes: 0 additions & 2 deletions neps/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,8 +519,6 @@ def _launch_runtime( # noqa: PLR0913
max_evaluations=max_evaluations_total,
used_evaluations=0,
)
if max_cost_total is not None
else None
),
shared_state={}, # TODO: Unused for the time being...
),
Expand Down
64 changes: 57 additions & 7 deletions neps/sampling/distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,22 @@
from typing_extensions import override

import torch
from torch.distributions import Distribution, constraints
from torch.distributions import Distribution, Uniform, constraints
from torch.distributions.utils import broadcast_all

from neps.search_spaces.domain import Domain

if TYPE_CHECKING:
from neps.search_spaces.architecture.cfg_variants.constrained_cfg import Constraint
from neps.search_spaces.domain import Domain

CONST_SQRT_2 = math.sqrt(2)
CONST_INV_SQRT_2PI = 1 / math.sqrt(2 * math.pi)
CONST_INV_SQRT_2 = 1 / math.sqrt(2)
CONST_LOG_INV_SQRT_2PI = math.log(CONST_INV_SQRT_2PI)
CONST_LOG_SQRT_2PI_E = 0.5 * math.log(2 * math.pi * math.e)
CONST_SQRT_2 = torch.tensor(math.sqrt(2), dtype=torch.float64)
CONST_INV_SQRT_2PI = torch.tensor(1 / math.sqrt(2 * math.pi), dtype=torch.float64)
CONST_INV_SQRT_2 = torch.tensor(1 / math.sqrt(2), dtype=torch.float64)
CONST_LOG_INV_SQRT_2PI = torch.tensor(math.log(CONST_INV_SQRT_2PI), dtype=torch.float64)
CONST_LOG_SQRT_2PI_E = torch.tensor(
0.5 * math.log(2 * math.pi * math.e),
dtype=torch.float64,
)

# from https://github.com/toshas/torch_truncnorm

Expand Down Expand Up @@ -224,7 +228,53 @@ def log_prob(self, value):
return super().log_prob(value) - self._log_scale


class UniformWithUpperBound(Uniform):
"""Uniform distribution with upper bound inclusive.
This is mostly a hack because torch's version of Uniform does not include
the upper bound which only causes a problem when considering the log_prob.
Otherwise the upper bound works with every other method.
"""

# OPTIM: This could probably be optimized a lot but I'm not sure how it effects
# gradients. Could probably do a different path depending on if `value` requires
# gradients or not.
@override
def log_prob(self, value: torch.Tensor) -> torch.Tensor:
if self._validate_args:
self._validate_sample(value)

lb = self.low.le(value).type_as(self.low)
ub = self.high.ge(value).type_as(self.low) # The main change, is `gt` in original
return torch.log(lb.mul(ub)) - torch.log(self.high - self.low)


@dataclass
class TorchDistributionWithDomain:
distribution: Distribution
domain: Domain


UNIT_UNIFORM_DIST = TorchDistributionWithDomain(
distribution=UniformWithUpperBound(0, 1),
domain=Domain.unit_float(),
)

if __name__ == "__main__":
loc = 0.95
for confidence in torch.linspace(0.0, 0.8, 8):
scale = 1 - confidence
dist = TruncatedNormal(
loc=loc,
scale=scale,
a=0.0,
b=1.0,
)
xs = torch.linspace(0, 1, 100)
ys = dist.log_prob(xs)
import matplotlib.pyplot as plt

plt.plot(xs, ys, label=f"confidence={confidence}")
plt.plot(loc, dist.log_prob(torch.tensor(loc)), "ro")
plt.legend()
plt.show()
Loading

0 comments on commit 5f76a7e

Please sign in to comment.