From b0ed54c5bb38a3a8be3ff7576a89c3e021b80155 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 14:00:51 -0600
Subject: [PATCH 1/8] Fixed bug in convergence criterion for `named_arrays.optimize` functions.

---
 named_arrays/_scalars/scalar_named_array_functions.py | 2 +-
 named_arrays/_vectors/vector_named_array_functions.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/named_arrays/_scalars/scalar_named_array_functions.py b/named_arrays/_scalars/scalar_named_array_functions.py
index b3bb888..d0d6edc 100644
--- a/named_arrays/_scalars/scalar_named_array_functions.py
+++ b/named_arrays/_scalars/scalar_named_array_functions.py
@@ -917,7 +917,7 @@ def optimize_root_newton(
         if callback is not None:
             callback(i, x, f, converged)

-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error

         if np.all(converged):
             return x
diff --git a/named_arrays/_vectors/vector_named_array_functions.py b/named_arrays/_vectors/vector_named_array_functions.py
index 23b1a75..4768dc7 100644
--- a/named_arrays/_vectors/vector_named_array_functions.py
+++ b/named_arrays/_vectors/vector_named_array_functions.py
@@ -450,7 +450,7 @@ def optimize_root_newton(
         if callback is not None:
             callback(i, x, f, converged)

-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error

         if np.all(converged):
             return x
@@ -555,7 +555,7 @@ def optimize_minimum_gradient_descent(

         grad = gradient(x)

-        converged |= np.abs(grad) < min_gradient
+        converged = np.abs(grad) < min_gradient

         if np.all(converged):
             return x
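
The change above replaces an in-place OR (`converged |= ...`) with a plain assignment, so the convergence mask is recomputed from the current residual on every iteration instead of latching permanently the first time an element falls within tolerance; a latched mask can report convergence for elements whose residual has since drifted back above `max_abs_error`. The following self-contained NumPy sketch (illustrative only, not the named_arrays implementation) contrasts the two criteria in a vectorized Newton iteration:

    import numpy as np

    def newton_root(f, dfdx, x0, max_abs_error=1e-10, max_iterations=100, sticky=False):
        # Vectorized Newton iteration over an array of initial guesses.
        x = np.asarray(x0, dtype=float).copy()
        converged = np.zeros(x.shape, dtype=bool)
        for i in range(max_iterations):
            residual = f(x)
            if sticky:
                # Old criterion: once an element is flagged it stays flagged,
                # even if a later step pushes its residual back out of tolerance.
                converged |= np.abs(residual) < max_abs_error
            else:
                # New criterion: the mask always reflects the current residual.
                converged = np.abs(residual) < max_abs_error
            if np.all(converged):
                return x
            x = x - residual / dfdx(x)
        raise ValueError("Newton's method did not converge")

    # Roots of x**2 - 2 from several starting points.
    print(newton_root(lambda x: x**2 - 2, lambda x: 2 * x, [1.0, 3.0, 10.0]))
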
From 6e39ca76ba51174f85a8190528f89dbbdebf86f2 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 16:36:20 -0600
Subject: [PATCH 2/8] asymmetric test function

---
 named_arrays/_vectors/tests/test_vectors.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 3f19bc4..dbd1069 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -578,7 +578,11 @@ class TestJacobian(
     @pytest.mark.parametrize(
         argnames="function",
         argvalues=[
-            lambda x: np.square(na.value(x) - shift_horizontal) + shift_vertical
+            lambda x: np.where(
+                na.value(x) - shift_horizontal < 0,
+                np.square(1 * (na.value(x) - shift_horizontal)) + shift_vertical,
+                np.square(2 * (na.value(x) - shift_horizontal)) + shift_vertical,
+            )
             for shift_horizontal in [20,]
             for shift_vertical in [-1]
         ],

From fe1ba88c2840742d32c1d5974a94c6ffa5fec622 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 16:40:56 -0600
Subject: [PATCH 3/8] revert last commit

---
 named_arrays/_vectors/tests/test_vectors.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index dbd1069..3f19bc4 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -578,11 +578,7 @@ class TestJacobian(
     @pytest.mark.parametrize(
         argnames="function",
         argvalues=[
-            lambda x: np.where(
-                na.value(x) - shift_horizontal < 0,
-                np.square(1 * (na.value(x) - shift_horizontal)) + shift_vertical,
-                np.square(2 * (na.value(x) - shift_horizontal)) + shift_vertical,
-            )
+            lambda x: np.square(na.value(x) - shift_horizontal) + shift_vertical
             for shift_horizontal in [20,]
             for shift_vertical in [-1]
         ],

From ac551abfd2535d67df27c1dff9fefb77291a4cc7 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 17:27:10 -0600
Subject: [PATCH 4/8] check units

---
 named_arrays/_vectors/tests/test_vectors.py | 5 ++++-
 named_arrays/tests/test_core.py             | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 3f19bc4..369fb13 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -597,7 +597,10 @@ class TestOptimizeRoot(
     @pytest.mark.parametrize(
         argnames="function,expected",
         argvalues=[
-            (lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length, shift_horizontal)
+            (
+                lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length * u.ph,
+                shift_horizontal,
+            )
             for shift_horizontal in [20,]
             for shift_vertical in [1,]
         ]
     )
diff --git a/named_arrays/tests/test_core.py b/named_arrays/tests/test_core.py
index 3a9a5e5..2ec1b80 100644
--- a/named_arrays/tests/test_core.py
+++ b/named_arrays/tests/test_core.py
@@ -1471,7 +1471,7 @@ def callback(i, x, f, c):
                 callback=callback,
             )

-            assert np.allclose(na.value(result), expected)
+            assert np.allclose(result, expected * na.unit_normalized(array))
             assert out is result

         @pytest.mark.parametrize(
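
Patch 4 above gives the test objective photon units (`u.ph`) and checks the solver output as a quantity, `np.allclose(result, expected * na.unit_normalized(array))`, instead of stripping units with `na.value(result)` first. Comparing stripped values can hide a result that is numerically right but carries the wrong unit. A minimal astropy.units sketch of the difference (separate from the named_arrays test suite):

    import numpy as np
    import astropy.units as u

    expected = 20 * u.mm
    result = 20 * u.m   # numerically equal, but in the wrong unit

    print(np.allclose(result.value, expected.value))  # True: the unit error goes unnoticed
    print(np.allclose(result, expected))              # False: quantities are converted before comparing
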
From 4388d8fbd707f960b9b831809c3087b2f165bb4f Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:03:19 -0600
Subject: [PATCH 5/8] Added momentum

---
 docs/refs.bib                                 | 10 +++++--
 named_arrays/_vectors/tests/test_vectors.py   |  4 +--
 .../_vectors/vector_named_array_functions.py  |  6 +++-
 named_arrays/optimize.py                      | 29 +++++++++++++++++--
 4 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/docs/refs.bib b/docs/refs.bib
index 713bb3d..1b20a6a 100644
--- a/docs/refs.bib
+++ b/docs/refs.bib
@@ -11,5 +11,11 @@ @article{Eriksson1990
     URL = {https://doi.org/10.1080/0025570X.1990.11977515},
     eprint = {https://doi.org/10.1080/0025570X.1990.11977515}
 }
-
-
+@article{Goh2017,
+    author = {Goh, Gabriel},
+    title = {Why Momentum Really Works},
+    journal = {Distill},
+    year = {2017},
+    url = {http://distill.pub/2017/momentum},
+    doi = {10.23915/distill.00006}
+}
diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 369fb13..e1afc8d 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -598,10 +598,10 @@ class TestOptimizeRoot(
         argnames="function,expected",
         argvalues=[
             (
-                lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length * u.ph,
+                lambda x: (np.square((na.value(x) - shift_horizontal).length) + shift_vertical) * u.ph,
                 shift_horizontal,
             )
-            for shift_horizontal in [20,]
+            for shift_horizontal in [2,]
             for shift_vertical in [1,]
         ]
     )
diff --git a/named_arrays/_vectors/vector_named_array_functions.py b/named_arrays/_vectors/vector_named_array_functions.py
index 4768dc7..4bbc7c5 100644
--- a/named_arrays/_vectors/vector_named_array_functions.py
+++ b/named_arrays/_vectors/vector_named_array_functions.py
@@ -518,6 +518,7 @@ def optimize_minimum_gradient_descent(
     function: Callable[[na.AbstractVectorArray], na.AbstractScalar],
     guess: na.AbstractVectorArray,
     step_size: float | na.AbstractScalar,
+    momentum: float | na.AbstractScalar,
     gradient: None | Callable[[na.AbstractVectorArray], na.AbstractScalar],
     min_gradient: na.ScalarLike,
     max_iterations: int,
@@ -547,6 +548,7 @@

     converged = na.broadcast_to(0 * na.value(x), shape=shape).astype(bool)

     x = na.broadcast_to(x, shape).astype(float)
+    z = 0

     for i in range(max_iterations):
@@ -560,7 +562,9 @@
         if np.all(converged):
             return x

-        correction = step_size * grad
+        z = momentum * z + grad
+
+        correction = step_size * z

         x = x - correction

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index e710aa5..315d5ce 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -142,12 +142,13 @@ def minimum_gradient_descent(
     function: Callable[[InputT], OutputT],
     guess: InputT,
     step_size: None | InputT = None,
+    momentum: float | OutputT = 0,
     gradient: None | Callable[[InputT], InputT] = None,
     min_gradient: None | InputT = None,
     max_iterations: int = 1000,
     callback: None | Callable[[int, InputT, OutputT, na.AbstractArray], None] = None,
 ) -> InputT:
-    """
+    r"""
     Find the local minimum of the given function using the
     `gradient descent `_
     method.
@@ -161,7 +162,12 @@ def minimum_gradient_descent(
         The learning rate for the gradient descent algorithm.
         This should have the same units as ``x / gradient(x)``.
         If :obj:`None` (the default), this takes the value
-        ``0.1 * na.unit(x / gradient(x))``.
+        ``0.01 * na.unit(x / gradient(x))``.
+    momentum
+        The momentum constant, :math:`\beta`, for the gradient descent algorithm.
+        Should be a dimensionless number between zero and one.
+        Defaults to zero, which is equivalent to vanilla gradient descent with
+        no momentum.
     gradient
         The gradient of `function`.
         If :obj:`None` (the default), the gradient is computed using
@@ -180,6 +186,22 @@ def minimum_gradient_descent(
         ``x`` is the current guess,
         ``f`` is the current function value,
         and ``converged`` is an array storing the convergence state for every
         minimum being computed.
+
+    Notes
+    -----
+
+    This function uses the update rules described in :cite:t:`Goh2017`,
+
+    .. math::
+
+        z_{k + 1} = \beta z_k + \grad f(x_k) \\
+
+        x_{k + 1} = x_k - \alpha z_k,
+
+    where :math:`x_k` is the current guess for iteration :math:`k`,
+    :math:`f` is the objective function,
+    :math:`\alpha` is the learning rate,
+    and :math:`\beta` is the momentum constant.
     """

     x = guess
@@ -191,7 +213,7 @@ def minimum_gradient_descent(
     unit_grad = unit_f / unit_x

     if step_size is None:
-        step_size = 0.1 * (unit_x / unit_grad)
+        step_size = 0.01 * (unit_x / unit_grad)

     if gradient is None:
         def gradient(x: float | na.AbstractScalar | na.AbstractVectorArray):
@@ -209,6 +231,7 @@ def gradient(x: float | na.AbstractScalar | na.AbstractVectorArray):
         function=function,
         guess=guess,
         step_size=step_size,
+        momentum=momentum,
         gradient=gradient,
         min_gradient=min_gradient,
         max_iterations=max_iterations,

From 7c7a8af73ef751d8749b2a499d030662aa695705 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:05:51 -0600
Subject: [PATCH 6/8] Use momentum in tests

---
 named_arrays/tests/test_core.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/named_arrays/tests/test_core.py b/named_arrays/tests/test_core.py
index 2ec1b80..8b9ded1 100644
--- a/named_arrays/tests/test_core.py
+++ b/named_arrays/tests/test_core.py
@@ -1469,6 +1469,7 @@ def callback(i, x, f, c):
                 function=function,
                 guess=array,
                 callback=callback,
+                momentum=0.5,
             )

             assert np.allclose(result, expected * na.unit_normalized(array))

From 1d1f6134d56b5f1b79f212c857db67ab489d9163 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:30:50 -0600
Subject: [PATCH 7/8] docs

---
 named_arrays/optimize.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index 315d5ce..f7845d8 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -193,8 +193,12 @@ def minimum_gradient_descent(
     This function uses the update rules described in :cite:t:`Goh2017`,

     .. math::
+        :label: momentum-equation

-        z_{k + 1} = \beta z_k + \grad f(x_k) \\
+        z_{k + 1} = \beta z_k + \del f(x_k)
+
+    .. math::
+        :label: gradient-descent

         x_{k + 1} = x_k - \alpha z_k,


From 5fd781a5bb411a9cedf55bdfc514765c0403c5e0 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:48:23 -0600
Subject: [PATCH 8/8] docs

---
 named_arrays/optimize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index f7845d8..d6fc6ef 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -195,7 +195,7 @@ def minimum_gradient_descent(
     .. math::
         :label: momentum-equation

-        z_{k + 1} = \beta z_k + \del f(x_k)
+        z_{k + 1} = \beta z_k + \nabla f(x_k)

     .. math::
         :label: gradient-descent