Merge pull request #435 from MannLabs/dynamic-hyperparameter

add dynamic hyperparameter scaling
MannLabs · Jan 17, 2025 · 5f24fa3 · 5f24fa3
2 parents b9aa274 + acc0dad
commit 5f24fa3
Show file tree

Hide file tree

Showing 3 changed files with 87 additions and 1 deletion.
diff --git a/alphadia/fdrexperimental.py b/alphadia/fdrexperimental.py
@@ -1,4 +1,5 @@
 # native imports
+import logging
 import warnings
 from abc import ABC, abstractmethod
 from copy import deepcopy
@@ -13,6 +14,8 @@
 from torchmetrics.classification import BinaryAUROC
 from tqdm import tqdm
 
+logger = logging.getLogger()
+
 
 class Classifier(ABC):
     """Abstract base class for classifiers.
@@ -905,6 +908,42 @@ def predict_proba(self, x: np.ndarray):
         return self.network(torch.Tensor(x)).detach().numpy()
 
 
+def get_scaled_training_params(df, base_lr=0.001, max_batch=1024, min_batch=64):
+    """
+    Scale batch size linearly with the number of samples and learning rate with the square root of the batch size.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame or array-like
+        Input data; only its length (number of samples) is used
+    base_lr : float, optional
+        Base learning rate used at the maximum batch size, defaults to 0.001
+    max_batch : int, optional
+        Maximum batch size (1024 for >= 1M samples), defaults to 1024
+    min_batch : int, optional
+        Minimum batch size, defaults to 64
+
+    Returns
+    -------
+    tuple(int, float)
+        (batch_size, learning_rate)
+    """
+    n_samples = len(df)
+
+    # For >= 1M samples, use max batch size
+    if n_samples >= 1_000_000:
+        return max_batch, base_lr
+
+    # Calculate scaled batch size (proportional to n_samples, clipped to [min_batch, max_batch])
+    batch_size = int(np.clip((n_samples / 1_000_000) * max_batch, min_batch, max_batch))
+
+    # Scale learning rate using square root relationship
+    # sqrt(batch_size) / sqrt(max_batch) = scaled_lr / base_lr
+    learning_rate = base_lr * np.sqrt(batch_size / max_batch)
+
+    return batch_size, learning_rate
+
+
 class BinaryClassifierLegacyNewBatching(Classifier):
     def __init__(
         self,
@@ -918,6 +957,7 @@ def __init__(
         layers: list[int] | None = None,
         dropout: float = 0.001,
         metric_interval: int = 1000,
+        experimental_hyperparameter_tuning: bool = False,
         **kwargs,
     ):
         """Binary Classifier using a feed forward neural network.
@@ -955,6 +995,9 @@ def __init__(
         metric_interval : int, default=1000
             Interval for logging metrics during training.
 
+        experimental_hyperparameter_tuning : bool, default=False
+            If True, fit() overrides batch_size and learning_rate with values scaled to the number of samples.
+
         """
         if layers is None:
             layers = [100, 50, 20, 5]
@@ -968,6 +1011,7 @@ def __init__(
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.metric_interval = metric_interval
+        self.experimental_hyperparameter_tuning = experimental_hyperparameter_tuning
 
         self.network = None
         self.optimizer = None
@@ -1066,6 +1110,14 @@ def fit(self, x: np.ndarray, y: np.ndarray):
             Target values of shape (n_samples,) or (n_samples, n_classes).
 
         """
+        if self.experimental_hyperparameter_tuning:
+            self.batch_size, self.learning_rate = get_scaled_training_params(x)
+            logger.info(
+                f"Estimating optimal hyperparameters - "
+                f"samples: {len(x):,}, "
+                f"batch_size: {self.batch_size:,}, "
+                f"learning_rate: {self.learning_rate:.2e}"
+            )
 
         force_reinit = False
 

diff --git a/alphadia/workflow/peptidecentric.py b/alphadia/workflow/peptidecentric.py
@@ -95,7 +95,11 @@
 ]
 
 classifier_base = fdrx.BinaryClassifierLegacyNewBatching(
-    test_size=0.001, batch_size=5000, learning_rate=0.001, epochs=10
+    test_size=0.001,
+    batch_size=5000,
+    learning_rate=0.001,
+    epochs=10,
+    experimental_hyperparameter_tuning=True,
 )
 
 

diff --git a/tests/unit_tests/test_fdrx_base.py b/tests/unit_tests/test_fdrx_base.py
@@ -2,8 +2,10 @@
 
 import numpy as np
 import pandas as pd
+import pytest
 from sklearn.linear_model import LogisticRegression
 
+from alphadia.fdrexperimental import get_scaled_training_params
 from alphadia.fdrx.base import TargetDecoyFDR
 
 
@@ -45,3 +47,31 @@ def test_target_decoy_fdr(mock_show):
     assert all([col in df.columns for col in ["decoy_proba", "qval", "pep"]])
     assert np.all(df[["decoy_proba", "qval", "pep"]].values >= 0)
     assert np.all(df[["decoy_proba", "qval", "pep"]].values <= 1)
+
+
+@pytest.mark.parametrize(
+    "n_samples,expected_batch,expected_lr",
+    [
+        # Large dataset case (≥1M samples)
+        (1_000_000, 1024, 0.001),
+        (2_000_000, 1024, 0.001),
+        # Mid-size dataset cases
+        (500_000, 512, 0.001 * np.sqrt(512 / 1024)),  # 50% of max
+        (250_000, 256, 0.001 * np.sqrt(256 / 1024)),  # 25% of max
+        # Small dataset cases
+        (50_000, 64, 0.001 * np.sqrt(64 / 1024)),  # Should hit min batch size
+        (1_000, 64, 0.001 * np.sqrt(64 / 1024)),  # Should hit min batch size
+    ],
+)
+def test_get_scaled_training_params(n_samples, expected_batch, expected_lr):
+    # Create dummy dataframe with specified number of samples
+    df = pd.DataFrame({"col1": range(n_samples)})
+
+    # Get scaled parameters
+    batch_size, learning_rate = get_scaled_training_params(df)
+
+    # Check batch size matches expected
+    assert batch_size == expected_batch
+
+    # Check learning rate matches expected (within floating point precision)
+    assert np.isclose(learning_rate, expected_lr, rtol=1e-10)