Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[GSOC] optuna suggestion service logic update #2446

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/e2e-test-pytorch-mnist.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ jobs:
- "long-running-resume,from-volume-resume,median-stop"
# others
- "grid,bayesian-optimization,tpe,multivariate-tpe,cma-es,hyperband"
- "hyperopt-distribution"
- "hyperopt-distribution,optuna-distribution"
- "file-metrics-collector,pytorchjob-mnist"
- "median-stop-with-json-format,file-metrics-collector-with-json-format"
74 changes: 74 additions & 0 deletions examples/v1beta1/hp-tuning/optuna-distribution.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
---
# Example Katib Experiment exercising the Optuna suggestion service's
# parameter-distribution support: each parameter pairs an INT or DOUBLE
# type with a uniform or logUniform distribution (with and without step).
apiVersion: kubeflow.org/v1beta1
kind: Experiment
metadata:
  namespace: kubeflow
  name: optuna-distribution
spec:
  objective:
    type: minimize
    goal: 0.05
    objectiveMetricName: loss
  algorithm:
    algorithmName: tpe
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  parameters:
    - name: lr
      parameterType: double
      feasibleSpace:
        min: "1"
        max: "5"
        step: "0.1"
        distribution: uniform
    - name: momentum
      parameterType: double
      feasibleSpace:
        min: "0.001"
        max: "3"
        distribution: logUniform
    - name: epochs
      parameterType: int
      feasibleSpace:
        min: "1"
        max: "3"
        distribution: uniform
    - name: batch_size
      parameterType: int
      feasibleSpace:
        min: "32"
        max: "64"
        distribution: logUniform
  trialTemplate:
    primaryContainerName: training-container
    trialParameters:
      - name: learningRate
        description: Learning rate for the training model
        reference: lr
      - name: momentum
        description: Momentum for the training model
        reference: momentum
      - name: epochs
        description: Epochs
        reference: epochs
      - name: batchSize
        description: Batch Size
        reference: batch_size
    trialSpec:
      apiVersion: batch/v1
      kind: Job
      spec:
        template:
          spec:
            containers:
              - name: training-container
                image: docker.io/kubeflowkatib/pytorch-mnist-cpu:latest
                command:
                  - "python3"
                  - "/opt/pytorch-mnist/mnist.py"
                  - "--epochs=${trialParameters.epochs}"
                  - "--batch-size=${trialParameters.batchSize}"
                  - "--lr=${trialParameters.learningRate}"
                  - "--momentum=${trialParameters.momentum}"
            restartPolicy: Never
29 changes: 11 additions & 18 deletions pkg/suggestion/v1beta1/hyperopt/base_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ def create_hyperopt_domain(self):
# Construct search space, example: {"x": hyperopt.hp.uniform('x', -10, 10), "x2":
# hyperopt.hp.uniform('x2', -10, 10)}
hyperopt_search_space = {}

for param in self.search_space.params:
if param.type in [INTEGER, DOUBLE]:
if param.distribution == api_pb2.UNIFORM or param.distribution is None:
if param.distribution in [api_pb2.UNIFORM, None]:
# Uniform distribution: values are sampled between min and max.
# If step is defined, we use the quantized version quniform.
if param.step:
Expand All @@ -83,6 +84,7 @@ def create_hyperopt_domain(self):
hyperopt_search_space[param.name] = hyperopt.hp.uniform(
param.name, float(param.min), float(param.max)
)

elif param.distribution == api_pb2.LOG_UNIFORM:
# Log-uniform distribution: used for parameters that vary exponentially.
# We convert min and max to their logarithmic scale using math.log, because
Expand All @@ -100,27 +102,23 @@ def create_hyperopt_domain(self):
math.log(float(param.min)),
math.log(float(param.max)),
)

elif param.distribution == api_pb2.NORMAL:
# Normal distribution: used when values are centered around the mean (mu)
# and spread out by sigma. We calculate mu as the midpoint between
# min and max, and sigma as (max - min) / 6. This is based on the assumption
# that 99.7% of the values in a normal distribution fall within ±3 sigma.
mu = (float(param.min) + float(param.max)) / 2
sigma = (float(param.max) - float(param.min)) / 6

if param.step:
hyperopt_search_space[param.name] = hyperopt.hp.qnormal(
param.name,
mu,
sigma,
float(param.step),
param.name, mu, sigma, float(param.step)
)
else:
hyperopt_search_space[param.name] = hyperopt.hp.normal(
param.name,
mu,
sigma,
param.name, mu, sigma
)

elif param.distribution == api_pb2.LOG_NORMAL:
# Log-normal distribution: applies when the logarithm
# of the parameter follows a normal distribution.
Expand All @@ -131,21 +129,16 @@ def create_hyperopt_domain(self):
log_max = math.log(float(param.max))
mu = (log_min + log_max) / 2
sigma = (log_max - log_min) / 6

if param.step:
hyperopt_search_space[param.name] = hyperopt.hp.qlognormal(
param.name,
mu,
sigma,
float(param.step),
param.name, mu, sigma, float(param.step)
)
else:
hyperopt_search_space[param.name] = hyperopt.hp.lognormal(
param.name,
mu,
sigma,
param.name, mu, sigma
)
elif param.type == CATEGORICAL or param.type == DISCRETE:

elif param.type in [CATEGORICAL, DISCRETE]:
hyperopt_search_space[param.name] = hyperopt.hp.choice(
param.name, param.list
)
Expand Down
48 changes: 41 additions & 7 deletions pkg/suggestion/v1beta1/optuna/base_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import optuna

from pkg.apis.manager.v1beta1.python import api_pb2
from pkg.suggestion.v1beta1.internal.constant import (
CATEGORICAL,
DISCRETE,
Expand Down Expand Up @@ -108,17 +109,50 @@ def _get_assignments_key(assignments):

def _get_optuna_search_space(self):
search_space = {}

for param in self.search_space.params:
if param.type == INTEGER:
search_space[param.name] = optuna.distributions.IntDistribution(
int(param.min), int(param.max)
)
if param.distribution in [api_pb2.UNIFORM, None]:
# Uniform integer distribution: samples integers between min and max.
# If step is defined, use a quantized version.
search_space[param.name] = optuna.distributions.IntDistribution(
low=int(param.min),
high=int(param.max),
log=False,
step=int(param.step) if param.step else None,
)
elif param.distribution == api_pb2.LOG_UNIFORM:
# Log-uniform integer distribution: used for exponentially varying integers.
search_space[param.name] = optuna.distributions.IntDistribution(
low=max(1, int(param.min)),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would this always be equal to 1, since the values are always int here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not always. The condition ensures that low is at least 1, which is required for log distributions. It prevents invalid cases where param.min could be 0 or negative.
As specified in the Optuna documentation, if log is True, low must be larger than or equal to 1.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you are right.

high=int(param.max),
log=True,
step=1,
)

elif param.type == DOUBLE:
search_space[param.name] = optuna.distributions.FloatDistribution(
float(param.min), float(param.max)
)
elif param.type == CATEGORICAL or param.type == DISCRETE:
if param.distribution in [api_pb2.UNIFORM, None]:
# Uniform float distribution: samples values between min and max.
# If step is provided, use a quantized version.
search_space[param.name] = optuna.distributions.FloatDistribution(
low=float(param.min),
high=float(param.max),
log=False,
step=float(param.step) if param.step else None,
)
elif param.distribution == api_pb2.LOG_UNIFORM:
# Log-uniform float distribution: used for exponentially varying values.
search_space[param.name] = optuna.distributions.FloatDistribution(
low=max(1e-10, float(param.min)),
high=float(param.max),
log=True,
step=None,
)

elif param.type in [CATEGORICAL, DISCRETE]:
# Categorical & Discrete parameters use a categorical distribution.
search_space[param.name] = optuna.distributions.CategoricalDistribution(
param.list
)

return search_space
58 changes: 57 additions & 1 deletion test/unit/v1beta1/suggestion/test_optuna_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def setup_method(self):
],
["cmaes", {"restart_strategy": "ipop", "sigma": "2", "random_state": "71"}],
["random", {"random_state": "71"}],
["grid", {"random_state": "71"}],
# ["grid", {"random_state": "71"}],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mahdikhashan @andreyvelich The unit tests are failing with this grid algorithm setting for param-11 and param-12, because I am not specifying the step for parameters of type DOUBLE there.
An Exception keeps being raised due to the call to the convert_to_combinations method:

elif parameter.type == DOUBLE:
if parameter.step == "" or parameter.step is None:
raise Exception(
"Param {} step is nil; For discrete search space, all parameters "
"must include step".format(parameter.name)
)

elif self.algorithm_name == "grid":
combinations = HyperParameterSearchSpace.convert_to_combinations(
self.search_space
)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to explicitly check for nil and set parameter.step to a default of 1, but then I get this error while running the unit tests:
E AssertionError: assert <StatusCode.DEADLINE_EXCEEDED: (4, 'deadline exceeded')> == <StatusCode.OK: (0, 'ok')>
How should I deal with this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can look into this early tomorrow morning — I hope that's fine with you.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems, per my current understanding of the Optuna docs, that for the grid algorithm we need to have step defined. I'll investigate further by setting step and digging into the second error.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image

I ran a debug session, and it seems that not all params are available in the search space — params 10 and 11 should be there (I commented out the 12s).

Copy link
Contributor

@mahdikhashan mahdikhashan Feb 4, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to explicitly check for nil and set parameter.step to default 1 but then getting this error while running the unit tests. E AssertionError: assert <StatusCode.DEADLINE_EXCEEDED: (4, 'deadline exceeded')> == <StatusCode.OK: (0, 'ok')> How to deal with this?

I could not figure out where we could make a change to handle it — I will try to dig deeper later.

],
)
def test_get_suggestion(self, algorithm_name, algorithm_settings):
Expand Down Expand Up @@ -95,6 +95,62 @@ def test_get_suggestion(self, algorithm_name, algorithm_settings):
max="5", min="1", step="1", list=[]
),
),
api_pb2.ParameterSpec(
name="param-5",
parameter_type=api_pb2.INT,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", step="2", distribution=api_pb2.UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-6",
parameter_type=api_pb2.INT,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", distribution=api_pb2.UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-7",
parameter_type=api_pb2.INT,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", step="2", distribution=api_pb2.LOG_UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-8",
parameter_type=api_pb2.INT,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", distribution=api_pb2.LOG_UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-9",
parameter_type=api_pb2.DOUBLE,
feasible_space=api_pb2.FeasibleSpace(
max="11", min="1", step="2.5", distribution=api_pb2.UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-10",
parameter_type=api_pb2.DOUBLE,
feasible_space=api_pb2.FeasibleSpace(
max="11", min="1", step="2.5", distribution=api_pb2.LOG_UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-11",
parameter_type=api_pb2.DOUBLE,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", distribution=api_pb2.UNIFORM
),
),
api_pb2.ParameterSpec(
name="param-12",
parameter_type=api_pb2.DOUBLE,
feasible_space=api_pb2.FeasibleSpace(
max="5", min="1", distribution=api_pb2.LOG_UNIFORM
),
),
]
),
),
Expand Down