
Commit 5baa713

Merge pull request #321 from kozistr/feature/orthogonalize
[Feature] Implement `OrthoGrad` optimizer
2 parents 61fbbd7 + 141e01e commit 5baa713

File tree

12 files changed: +111 lines, -21 lines

README.md

Lines changed: 8 additions & 2 deletions
@@ -10,7 +10,7 @@

 ## The reasons why you use `pytorch-optimizer`.

-* Wide range of supported optimizers. Currently, **87 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **89 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `Cautious`, `AdamD`, `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -195,7 +195,9 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | APOLLO | *SGD-like Memory, AdamW-level Performance* | [github](https://github.com/zhuhanqing/APOLLO) | <https://arxiv.org/abs/2412.05270> | [cite](https://github.com/zhuhanqing/APOLLO?tab=readme-ov-file#-citation) |
 | MARS | *Unleashing the Power of Variance Reduction for Training Large Models* | [github](https://github.com/AGI-Arena/MARS) | <https://arxiv.org/abs/2411.10438> | [cite](https://github.com/AGI-Arena/MARS/tree/main?tab=readme-ov-file#citation) |
 | SGDSaI | *No More Adam: Learning Rate Scaling at Initialization is All You Need* | [github](https://github.com/AnonymousAlethiometer/SGD_SaI) | <https://arxiv.org/abs/2411.10438> | [cite](https://github.com/AnonymousAlethiometer/SGD_SaI?tab=readme-ov-file#citation) |
-| Grams | *Grams: Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
+| Grams | *Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
+| OrthoGrad | *Grokking at the Edge of Numerical Stability* | [github](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | <https://arxiv.org/abs/2501.04697> | [cite](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability?tab=readme-ov-file#citation) |
+| Adam-ATAN2 | *Scaling Exponents Across Parameterizations and Optimizers* | | <https://arxiv.org/abs/2407.05872> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240705872E/exportcitation) |

 ## Supported LR Scheduler

@@ -371,6 +373,10 @@ Correcting the norm of a gradient in each iteration based on the adaptive traini

 Updates only occur when the proposed update direction aligns with the current gradient.

+### Adam-ATAN2
+
+Adam-atan2 is a new numerically stable, scale-invariant version of Adam that eliminates the epsilon hyperparameter.
+
 ## Frequently asked questions

 [here](docs/qa.md)
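As an aside on the Adam-ATAN2 entry added above: a minimal sketch of the idea, assuming Adam-style first- and second-moment estimates are already computed. It is illustrative only, not the library's implementation.

# Illustrative only: eps-based vs. atan2-based Adam-style parameter updates.
import torch

exp_avg = torch.randn(4)     # first-moment (momentum) estimate
exp_avg_sq = torch.rand(4)   # second-moment estimate
lr, eps = 1e-3, 1e-8

# Classic Adam step: eps keeps the division finite when exp_avg_sq is tiny.
update_eps = exp_avg / (exp_avg_sq.sqrt() + eps)

# atan2-style step: atan2 is bounded and well defined even at (0, 0),
# so the eps hyperparameter disappears.
update_atan2 = torch.atan2(exp_avg, exp_avg_sq.sqrt())

param = torch.zeros(4)
param.add_(update_atan2, alpha=-lr)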

docs/changelogs/v3.3.3.md

Lines changed: 6 additions & 2 deletions
@@ -4,7 +4,11 @@

 * Implement `Grams` optimizer. (#317, #318)
 * [Grams: Gradient Descent with Adaptive Momentum Scaling](https://arxiv.org/abs/2412.17107)
-* Support `stable_adamw` variant for `ADOPT` and `AdEMAMix` optimizer. (#320)
+* Support `stable_adamw` variant for `ADOPT` and `AdEMAMix` optimizer. (#321)
 * `optimizer = ADOPT(model.parameters(), ..., stable_adamw=True)`
-* Implement an experimental optimizer `Ranger25` (not tested). (#320)
+* Implement an experimental optimizer `Ranger25` (not tested). (#321)
 * mixing `ADOPT + AdEMAMix + StableAdamW + Cautious + RAdam` optimizers.
+* Implement `OrthoGrad` optimizer. (#321)
+* [Grokking at the Edge of Numerical Stability](https://arxiv.org/abs/2501.04697)
+* Support `Adam-Atan2` feature for `Prodigy` optimizer when `eps` is None. (#321)
+* [Scaling Exponents Across Parameterizations and Optimizers](https://arxiv.org/abs/2407.05872)

docs/index.md

Lines changed: 8 additions & 2 deletions
@@ -10,7 +10,7 @@

 ## The reasons why you use `pytorch-optimizer`.

-* Wide range of supported optimizers. Currently, **87 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
+* Wide range of supported optimizers. Currently, **89 optimizers (+ `bitsandbytes`, `qgalore`, `torchao`)**, **16 lr schedulers**, and **13 loss functions** are supported!
 * Including many variants such as `Cautious`, `AdamD`, `Gradient Centrailiaztion`
 * Easy to use, clean, and tested codes
 * Active maintenance
@@ -195,7 +195,9 @@ get_supported_optimizers(['adam*', 'ranger*'])
 | APOLLO | *SGD-like Memory, AdamW-level Performance* | [github](https://github.com/zhuhanqing/APOLLO) | <https://arxiv.org/abs/2412.05270> | [cite](https://github.com/zhuhanqing/APOLLO?tab=readme-ov-file#-citation) |
 | MARS | *Unleashing the Power of Variance Reduction for Training Large Models* | [github](https://github.com/AGI-Arena/MARS) | <https://arxiv.org/abs/2411.10438> | [cite](https://github.com/AGI-Arena/MARS/tree/main?tab=readme-ov-file#citation) |
 | SGDSaI | *No More Adam: Learning Rate Scaling at Initialization is All You Need* | [github](https://github.com/AnonymousAlethiometer/SGD_SaI) | <https://arxiv.org/abs/2411.10438> | [cite](https://github.com/AnonymousAlethiometer/SGD_SaI?tab=readme-ov-file#citation) |
-| Grams | *Grams: Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
+| Grams | *Gradient Descent with Adaptive Momentum Scaling* | | <https://arxiv.org/abs/2412.17107> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv241217107C/exportcitation) |
+| OrthoGrad | *Grokking at the Edge of Numerical Stability* | [github](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability) | <https://arxiv.org/abs/2501.04697> | [cite](https://github.com/LucasPrietoAl/grokking-at-the-edge-of-numerical-stability?tab=readme-ov-file#citation) |
+| Adam-ATAN2 | *Scaling Exponents Across Parameterizations and Optimizers* | | <https://arxiv.org/abs/2407.05872> | [cite](https://ui.adsabs.harvard.edu/abs/2024arXiv240705872E/exportcitation) |

 ## Supported LR Scheduler

@@ -371,6 +373,10 @@ Correcting the norm of a gradient in each iteration based on the adaptive traini

 Updates only occur when the proposed update direction aligns with the current gradient.

+### Adam-ATAN2
+
+Adam-atan2 is a new numerically stable, scale-invariant version of Adam that eliminates the epsilon hyperparameter.
+
 ## Frequently asked questions

 [here](docs/qa.md)

docs/optimizer.md

Lines changed: 4 additions & 0 deletions
@@ -256,6 +256,10 @@
     :docstring:
     :members:

+::: pytorch_optimizer.OrthoGrad
+    :docstring:
+    :members:
+
 ::: pytorch_optimizer.PAdam
     :docstring:
     :members:

pyproject.toml

Lines changed: 6 additions & 6 deletions
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pytorch_optimizer"
-version = "3.3.2"
+version = "3.3.3"
 description = "optimizer & lr scheduler & objective function collections in PyTorch"
 license = "Apache-2.0"
 authors = ["kozistr <kozistr@gmail.com>"]
@@ -16,11 +16,11 @@ keywords = [
     "Apollo", "APOLLO", "AvaGrad", "bSAM", "CAME", "DAdaptAdaGrad", "DAdaptAdam", "DAdaptAdan", "DAdaptSGD",
     "DAdaptLion", "DeMo", "DiffGrad", "FAdam", "Fromage", "FTRL", "GaLore", "Grams", "Gravity", "GrokFast", "GSAM",
     "Kate", "Lamb", "LaProp", "LARS", "Lion", "LOMO", "Lookahead", "MADGRAD", "MARS", "MSVAG", "Muno", "Nero",
-    "NovoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger", "Ranger21", "RotoGrad",
-    "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "ScheduleFreeRAdam", "SGDP", "Shampoo", "ScalableShampoo", "SGDW",
-    "SignSGD", "SM3", "SOAP", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC", "WSAM", "Yogi", "BCE",
-    "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered", "Tversky", "FocalTversky",
-    "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
+    "NovoGrad", "OrthoGrad", "PAdam", "PCGrad", "PID", "PNM", "Prodigy", "QHAdam", "QHM", "RAdam", "Ranger",
+    "Ranger21", "RotoGrad", "SAM", "ScheduleFreeSGD", "ScheduleFreeAdamW", "ScheduleFreeRAdam", "SGDP", "Shampoo",
+    "ScalableShampoo", "SGDW", "SignSGD", "SM3", "SOAP", "SopihaH", "SRMM", "StableAdamW", "SWATS", "Tiger", "TRAC",
+    "WSAM", "Yogi", "BCE", "BCEFocal", "Focal", "FocalCosine", "SoftF1", "Dice", "LDAM", "Jaccard", "Bi-Tempered",
+    "Tversky", "FocalTversky", "LovaszHinge", "bitsandbytes", "WSD", "QGaLore",
 ]
 classifiers = [
     "License :: OSI Approved :: Apache Software License",

pytorch_optimizer/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -116,13 +116,15 @@
     Muon,
     Nero,
     NovoGrad,
+    OrthoGrad,
     PAdam,
     PCGrad,
     Prodigy,
     QHAdam,
     RAdam,
     Ranger,
     Ranger21,
+    Ranger25,
     RotoGrad,
     SafeFP16Optimizer,
     ScalableShampoo,

pytorch_optimizer/optimizer/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@
 from pytorch_optimizer.optimizer.muon import Muon
 from pytorch_optimizer.optimizer.nero import Nero
 from pytorch_optimizer.optimizer.novograd import NovoGrad
+from pytorch_optimizer.optimizer.orthograd import OrthoGrad
 from pytorch_optimizer.optimizer.padam import PAdam
 from pytorch_optimizer.optimizer.pcgrad import PCGrad
 from pytorch_optimizer.optimizer.pid import PID

pytorch_optimizer/optimizer/orthograd.py

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@
+import torch
+
+from pytorch_optimizer.base.optimizer import BaseOptimizer
+from pytorch_optimizer.base.types import CLOSURE, LOSS, OPTIMIZER, PARAMETERS
+
+
+class OrthoGrad(BaseOptimizer):
+    r"""Grokking at the Edge of Numerical Stability.
+
+    A wrapper optimizer that projects gradients to be orthogonal to the current parameters before performing an update.
+
+    :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
+    :param optimizer: OPTIMIZER. base optimizer.
+    """
+
+    def __init__(self, params: PARAMETERS, optimizer: OPTIMIZER = torch.optim.AdamW, **kwargs):
+        self.eps: float = 1e-30
+
+        super().__init__(params, {})
+        self.base_optimizer = optimizer(self.param_groups, **kwargs)
+
+    def __str__(self) -> str:
+        return 'OrthoGrad'
+
+    @torch.no_grad()
+    def reset(self):
+        pass
+
+    @torch.no_grad()
+    def orthogonalize_gradients(self, params) -> None:
+        for p in params:
+            if p.grad is None:
+                continue
+
+            w = p.view(-1)
+            g = p.grad.view(-1)
+
+            proj = torch.dot(w, g).div_(torch.dot(w, w).add_(self.eps))
+            g_ortho = g.to(dtype=torch.float32, copy=True).sub_(w, alpha=proj)
+            g_ortho_scaled = g_ortho.mul_(g.norm(2).div_(g_ortho.norm(2).add_(self.eps)))
+
+            p.grad.copy_(g_ortho_scaled.view_as(p.grad))
+
+    @torch.no_grad()
+    def step(self, closure: CLOSURE = None) -> LOSS:
+        for group in self.param_groups:
+            self.orthogonalize_gradients(group['params'])
+        return self.base_optimizer.step(closure)
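
For context, a minimal usage sketch of the `OrthoGrad` wrapper added above, assuming the published `pytorch_optimizer` package; the model, data, and hyperparameters are placeholders.

# Minimal sketch: OrthoGrad wraps a base optimizer class and forwards its kwargs.
import torch
from torch import nn
from pytorch_optimizer import OrthoGrad

model = nn.Linear(10, 1)
optimizer = OrthoGrad(model.parameters(), optimizer=torch.optim.AdamW, lr=1e-3, weight_decay=1e-2)

x, y = torch.randn(8, 10), torch.randn(8, 1)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()       # gradients are projected orthogonal to the weights, then AdamW steps
optimizer.zero_grad()

Because the wrapper only rewrites `p.grad` before delegating to `base_optimizer.step`, it composes with any optimizer that reads gradients from `p.grad`.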

pytorch_optimizer/optimizer/prodigy.py

Lines changed: 12 additions & 6 deletions
@@ -16,7 +16,7 @@ class Prodigy(BaseOptimizer):
     :param params: PARAMETERS. iterable of parameters to optimize or dicts defining parameter groups.
     :param lr: float. learning rate.
     :param betas: BETAS. betas.
-    :param beta3: float. coefficients for computing the Prodidy step-size using running averages. If set to None,
+    :param beta3: float. coefficients for computing the Prodigy step-size using running averages. If set to None,
         uses the value of square root of beta2.
     :param d0: float. initial D estimate for D-adaptation (default 1e-6). Rarely needs changing.
     :param d_coef: float. Coefficient in the expression for the estimate of d.
@@ -26,7 +26,8 @@ class Prodigy(BaseOptimizer):
     :param fixed_decay: bool. fix weight decay.
     :param bias_correction: bool. turn on Adam's bias correction.
     :param safeguard_warmup: bool. remove lr from the denominator of D estimate to avoid issues during warm-up stage.
-    :param eps: float. term added to the denominator to improve numerical stability.
+    :param eps: float. term added to the denominator to improve numerical stability. when eps is None, use atan2 rather
+        than epsilon and division for parameter updates.
     """

     def __init__(
@@ -43,7 +44,7 @@ def __init__(
         fixed_decay: bool = False,
         bias_correction: bool = False,
         safeguard_warmup: bool = False,
-        eps: float = 1e-8,
+        eps: Optional[float] = 1e-8,
         **kwargs,
     ):
         self.validate_learning_rate(lr)
@@ -172,8 +173,6 @@ def step(self, closure: CLOSURE = None) -> LOSS:

                 exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

-                de_nom = exp_avg_sq.sqrt().add_(d * group['eps'])
-
                 self.apply_weight_decay(
                     p,
                     p.grad,
@@ -183,6 +182,13 @@
                     fixed_decay=group['fixed_decay'],
                 )

-                p.addcdiv_(exp_avg, de_nom, value=-d_lr)
+                de_nom = exp_avg_sq.sqrt()
+
+                if group['eps'] is not None:
+                    de_nom.add_(d * group['eps'])
+                    p.addcdiv_(exp_avg, de_nom, value=-d_lr)
+                else:
+                    update = exp_avg.clone().atan2_(de_nom)
+                    p.add_(update, alpha=-d_lr)

         return loss
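
A minimal usage sketch of the new `eps=None` branch above, assuming the published `pytorch_optimizer` package; the model and data are placeholders.

# Minimal sketch: passing eps=None switches Prodigy to the atan2-style update above.
import torch
from torch import nn
from pytorch_optimizer import Prodigy

model = nn.Linear(10, 1)
optimizer = Prodigy(model.parameters(), lr=1.0, eps=None)  # no epsilon to tune

loss = model(torch.randn(8, 10)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()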

tests/constants.py

Lines changed: 5 additions & 1 deletion
@@ -84,7 +84,7 @@
     Tiger,
     Yogi,
 )
-from tests.utils import build_lookahead
+from tests.utils import build_lookahead, build_orthograd

 DECOUPLE_FLAGS: List[bool] = [True, False]
 ADAPTIVE_FLAGS: List[bool] = [True, False]
@@ -115,6 +115,7 @@
     'radam',
     'ranger',
     'ranger21',
+    'ranger25',
     'pnm',
     'adapnm',
     'adan',
@@ -180,6 +181,7 @@

 OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
     (build_lookahead, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
+    (build_orthograd, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
     (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
     (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'ams_bound': True}, 5),
     (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'weight_decouple': False}, 5),
@@ -441,6 +443,7 @@
     (SWATS, {'lr': 5e-1, 'weight_decay': 1e-3}, 5),
     (SWATS, {'lr': 5e-1, 'weight_decay': 1e-3, 'ams_bound': True}, 5),
     (Prodigy, {'lr': 5e1, 'beta3': None, 'weight_decay': 1e-3}, 10),
+    (Prodigy, {'lr': 5e0, 'beta3': None, 'weight_decay': 1e-3, 'eps': None}, 15),
     (Prodigy, {'lr': 5e1, 'beta3': 0.999, 'weight_decay': 1e-3}, 10),
     (Prodigy, {'lr': 1e1, 'beta3': 0.999, 'weight_decay': 1e-3, 'bias_correction': True}, 15),
     (Prodigy, {'lr': 1e0, 'beta3': 0.999, 'weight_decay': 1e-3, 'safeguard_warmup': True}, 15),
@@ -545,6 +548,7 @@
     (SGDSaI, {'lr': 1e0, 'momentum': 0.0}, 15),
     (Grams, {'lr': 1e-1, 'weight_decay': 1e-3}, 5),
     (Ranger25, {'lr': 1e-1}, 25),
+    (Ranger25, {'lr': 1e-1, 't_alpha_beta3': 5}, 25),
 ]
 ADANORM_SUPPORTED_OPTIMIZERS: List[Tuple[Any, Dict[str, Union[float, bool, int]], int]] = [
     (AdaBelief, {'lr': 5e-1, 'weight_decay': 1e-3, 'adanorm': True}, 10),
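
`build_orthograd` comes from `tests/utils.py`, which is not part of this diff. A plausible shape, assuming it mirrors the existing `build_lookahead` pattern by wrapping a stock optimizer; the repository's actual helper may differ.

# Hypothetical sketch of tests/utils.build_orthograd (not shown in this commit).
from torch.optim import AdamW

from pytorch_optimizer import OrthoGrad


def build_orthograd(parameters, **kwargs):
    # Wrap a plain AdamW optimizer with OrthoGrad's gradient projection.
    return OrthoGrad(parameters, optimizer=AdamW, **kwargs)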
