Skip to content

Commit

Permalink
Merge pull request #435 from MannLabs/dynamic-hyperparameter
Browse files Browse the repository at this point in the history
add dynamic hyperparameter scaling
  • Loading branch information
GeorgWa authored Jan 17, 2025
2 parents b9aa274 + acc0dad commit 5f24fa3
Show file tree
Hide file tree
Showing 3 changed files with 87 additions and 1 deletion.
52 changes: 52 additions & 0 deletions alphadia/fdrexperimental.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# native imports
import logging
import warnings
from abc import ABC, abstractmethod
from copy import deepcopy
Expand All @@ -13,6 +14,8 @@
from torchmetrics.classification import BinaryAUROC
from tqdm import tqdm

# Module-wide logger. Calling getLogger() without a name returns the root
# logger, so records emitted here go straight to the root logger's handlers.
logger = logging.getLogger()


class Classifier(ABC):
"""Abstract base class for classifiers.
Expand Down Expand Up @@ -905,6 +908,42 @@ def predict_proba(self, x: np.ndarray):
return self.network(torch.Tensor(x)).detach().numpy()


def get_scaled_training_params(
    df, base_lr=0.001, max_batch=1024, min_batch=64, max_samples=1_000_000
):
    """Scale batch size and learning rate to the number of training samples.

    The batch size grows linearly with the number of samples and is clipped
    to ``[min_batch, max_batch]``. The learning rate then follows the batch
    size with a square root relationship, so that ``base_lr`` is used
    whenever the batch size equals ``max_batch``.

    Parameters
    ----------
    df : sized collection
        Training data. Only ``len(df)`` is used, so a ``pandas.DataFrame``,
        a ``numpy.ndarray`` or any other object supporting ``len`` works.
    base_lr : float, optional
        Learning rate used at ``max_batch`` batch size, defaults to 0.001
    max_batch : int, optional
        Maximum batch size, used for ``max_samples`` samples or more,
        defaults to 1024
    min_batch : int, optional
        Minimum batch size, defaults to 64
    max_samples : int, optional
        Number of samples at which ``max_batch`` is reached,
        defaults to 1_000_000

    Returns
    -------
    tuple(int, float)
        (batch_size, learning_rate)
    """
    n_samples = len(df)

    # At or above the reference size no scaling is needed. Returning early
    # also means the division below is never reached when max_samples <= 0.
    if n_samples >= max_samples:
        return max_batch, base_lr

    # Calculate scaled batch size (linear scaling, clipped to [min, max])
    batch_size = int(
        np.clip((n_samples / max_samples) * max_batch, min_batch, max_batch)
    )

    # Scale learning rate using square root relationship
    # sqrt(batch_size) / sqrt(max_batch) = scaled_lr / base_lr
    learning_rate = base_lr * np.sqrt(batch_size / max_batch)

    return batch_size, learning_rate


class BinaryClassifierLegacyNewBatching(Classifier):
def __init__(
self,
Expand All @@ -918,6 +957,7 @@ def __init__(
layers: list[int] | None = None,
dropout: float = 0.001,
metric_interval: int = 1000,
experimental_hyperparameter_tuning: bool = False,
**kwargs,
):
"""Binary Classifier using a feed forward neural network.
Expand Down Expand Up @@ -955,6 +995,9 @@ def __init__(
metric_interval : int, default=1000
Interval for logging metrics during training.
experimental_hyperparameter_tuning: bool, default=False
Whether to use experimental hyperparameter tuning.
"""
if layers is None:
layers = [100, 50, 20, 5]
Expand All @@ -968,6 +1011,7 @@ def __init__(
self.input_dim = input_dim
self.output_dim = output_dim
self.metric_interval = metric_interval
self.experimental_hyperparameter_tuning = experimental_hyperparameter_tuning

self.network = None
self.optimizer = None
Expand Down Expand Up @@ -1066,6 +1110,14 @@ def fit(self, x: np.ndarray, y: np.ndarray):
Target values of shape (n_samples,) or (n_samples, n_classes).
"""
if self.experimental_hyperparameter_tuning:
self.batch_size, self.learning_rate = get_scaled_training_params(x)
logger.info(
f"Estimating optimal hyperparameters - "
f"samples: {len(x):,}, "
f"batch_size: {self.batch_size:,}, "
f"learning_rate: {self.learning_rate:.2e}"
)

force_reinit = False

Expand Down
6 changes: 5 additions & 1 deletion alphadia/workflow/peptidecentric.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,11 @@
]

classifier_base = fdrx.BinaryClassifierLegacyNewBatching(
test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10
test_size=0.001,
batch_size=5000,
learning_rate=0.001,
epochs=10,
experimental_hyperparameter_tuning=True,
)


Expand Down
30 changes: 30 additions & 0 deletions tests/unit_tests/test_fdrx_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import numpy as np
import pandas as pd
import pytest
from sklearn.linear_model import LogisticRegression

from alphadia.fdrexperimental import get_scaled_training_params
from alphadia.fdrx.base import TargetDecoyFDR


Expand Down Expand Up @@ -45,3 +47,31 @@ def test_target_decoy_fdr(mock_show):
assert all([col in df.columns for col in ["decoy_proba", "qval", "pep"]])
assert np.all(df[["decoy_proba", "qval", "pep"]].values >= 0)
assert np.all(df[["decoy_proba", "qval", "pep"]].values <= 1)


@pytest.mark.parametrize(
    "n_samples,expected_batch,expected_lr",
    [
        # At or above 1M samples: maximum batch size, unscaled learning rate
        (1_000_000, 1024, 0.001),
        (2_000_000, 1024, 0.001),
        # Between the bounds: batch size proportional to the sample count
        (500_000, 512, 0.001 * np.sqrt(512 / 1024)),
        (250_000, 256, 0.001 * np.sqrt(256 / 1024)),
        # Small inputs: batch size is clipped to the minimum of 64
        (50_000, 64, 0.001 * np.sqrt(64 / 1024)),
        (1_000, 64, 0.001 * np.sqrt(64 / 1024)),
    ],
)
def test_get_scaled_training_params(n_samples, expected_batch, expected_lr):
    """Check batch size and learning rate scaling for various sample counts."""
    # given: a single-column dataframe with the requested number of rows
    frame = pd.DataFrame({"col1": range(n_samples)})

    # when: the default scaling is applied
    scaled_params = get_scaled_training_params(frame)

    # then: the batch size matches exactly ...
    assert scaled_params[0] == expected_batch

    # ... and the learning rate matches within floating point precision
    assert np.isclose(scaled_params[1], expected_lr, rtol=1e-10)

0 comments on commit 5f24fa3

Please sign in to comment.