Added momentum to named_arrays.optimize.minimize_gradient_descent() #73

Merged · 8 commits · Aug 26, 2024
10 changes: 8 additions & 2 deletions docs/refs.bib
@@ -11,5 +11,11 @@ @article{Eriksson1990
   URL = {https://doi.org/10.1080/0025570X.1990.11977515},
   eprint = {https://doi.org/10.1080/0025570X.1990.11977515}
 }
 
+@article{Goh2017,
+  author = {Goh, Gabriel},
+  title = {Why Momentum Really Works},
+  journal = {Distill},
+  year = {2017},
+  url = {http://distill.pub/2017/momentum},
+  doi = {10.23915/distill.00006}
+}
2 changes: 1 addition & 1 deletion named_arrays/_scalars/scalar_named_array_functions.py
@@ -917,7 +917,7 @@ def optimize_root_newton(
         if callback is not None:
             callback(i, x, f, converged)
 
-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error
 
         if np.all(converged):
             return x
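For context on the change from |= to = above: the in-place or keeps the convergence mask latched once an element has ever satisfied the tolerance, while plain assignment re-checks convergence from the current residual on every iteration. A minimal NumPy sketch of the difference, independent of this codebase (the variable names mirror the diff, the array values are made up for illustration):

import numpy as np

max_abs_error = 1e-6
f = np.array([0.5, 1e-9])            # current residuals for two independent solves
converged = np.array([True, False])  # element 0 had converged on an earlier iteration

latched = converged | (np.abs(f) < max_abs_error)  # [True, True]: stays converged even though |f| grew
fresh = np.abs(f) < max_abs_error                  # [False, True]: reflects only the current residual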
7 changes: 5 additions & 2 deletions named_arrays/_vectors/tests/test_vectors.py
@@ -597,8 +597,11 @@ class TestOptimizeRoot(
     @pytest.mark.parametrize(
         argnames="function,expected",
         argvalues=[
-            (lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length, shift_horizontal)
-            for shift_horizontal in [20,]
+            (
+                lambda x: (np.square((na.value(x) - shift_horizontal).length) + shift_vertical) * u.ph,
+                shift_horizontal,
+            )
+            for shift_horizontal in [2,]
             for shift_vertical in [1,]
         ]
     )
10 changes: 7 additions & 3 deletions named_arrays/_vectors/vector_named_array_functions.py
@@ -450,7 +450,7 @@
         if callback is not None:
             callback(i, x, f, converged)
 
-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error
 
         if np.all(converged):
             return x
@@ -518,6 +518,7 @@
     function: Callable[[na.AbstractVectorArray], na.AbstractScalar],
     guess: na.AbstractVectorArray,
     step_size: float | na.AbstractScalar,
+    momentum: float | na.AbstractScalar,
     gradient: None | Callable[[na.AbstractVectorArray], na.AbstractScalar],
     min_gradient: na.ScalarLike,
     max_iterations: int,
@@ -547,6 +548,7 @@
     converged = na.broadcast_to(0 * na.value(x), shape=shape).astype(bool)
 
     x = na.broadcast_to(x, shape).astype(float)
+    z = 0
 
     for i in range(max_iterations):
@@ -555,12 +557,14 @@
 
         grad = gradient(x)
 
-        converged |= np.abs(grad) < min_gradient
+        converged = np.abs(grad) < min_gradient
 
         if np.all(converged):
             return x
 
-        correction = step_size * grad
+        z = momentum * z + grad
+
+        correction = step_size * z
 
         x = x - correction
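Stripped of the named_arrays broadcasting and callback machinery, the updated loop amounts to classic gradient descent with momentum. A minimal sketch in plain NumPy of the same logic (the function name and plain-scalar inputs are hypothetical, not the library's actual worker):

import numpy as np

def minimize_with_momentum(gradient, x, step_size, momentum, min_gradient, max_iterations):
    # Gradient descent with momentum: z accumulates a decaying sum of past gradients.
    z = 0.0
    for _ in range(max_iterations):
        grad = gradient(x)
        if np.all(np.abs(grad) < min_gradient):
            return x                      # gradient is (near) zero everywhere: converged
        z = momentum * z + grad           # momentum update of the accumulator
        x = x - step_size * z             # step along the accumulated direction
    return x

# Example: minimize f(x) = (x - 2)**2, whose gradient is 2 * (x - 2).
x_min = minimize_with_momentum(
    gradient=lambda x: 2 * (x - 2),
    x=np.array(0.0),
    step_size=0.1,
    momentum=0.5,
    min_gradient=1e-8,
    max_iterations=1000,
)

Setting momentum to zero makes the correction reduce to step_size * grad, which matches the function's previous behavior.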
33 changes: 30 additions & 3 deletions named_arrays/optimize.py
@@ -142,12 +142,13 @@
     function: Callable[[InputT], OutputT],
     guess: InputT,
     step_size: None | InputT = None,
+    momentum: float | OutputT = 0,
     gradient: None | Callable[[InputT], InputT] = None,
     min_gradient: None | InputT = None,
     max_iterations: int = 1000,
     callback: None | Callable[[int, InputT, OutputT, na.AbstractArray], None] = None,
 ) -> InputT:
-    """
+    r"""
     Find the local minimum of the given function using the
     `gradient descent <https://en.wikipedia.org/wiki/Gradient_descent>`_ method.
@@ -161,7 +162,12 @@
         The learning rate for the gradient descent algorithm.
         This should have the same units as ``x / gradient(x)``.
         If :obj:`None` (the default), this takes the value
-        ``0.1 * na.unit(x / gradient(x))``.
+        ``0.01 * na.unit(x / gradient(x))``.
+    momentum
+        The momentum constant, :math:`\beta`, for the gradient descent algorithm.
+        Should be a dimensionless number between zero and one.
+        Defaults to zero, which is equivalent to vanilla gradient descent with
+        no momentum.
     gradient
         The gradient of `function`.
         If :obj:`None` (the default), the gradient is computed using
@@ -180,6 +186,26 @@
         ``x`` is the current guess, ``f`` is the current function value,
         and ``converged`` is an array storing the convergence state for every
         minimum being computed.
 
+    Notes
+    -----
+
+    This function uses the update rules described in :cite:t:`Goh2017`,
+
+    .. math::
+        :label: momentum-equation
+
+        z_{k + 1} = \beta z_k + \nabla f(x_k)
+
+    .. math::
+        :label: gradient-descent
+
+        x_{k + 1} = x_k - \alpha z_{k + 1},
+
+    where :math:`x_k` is the current guess for iteration :math:`k`,
+    :math:`f` is the objective function,
+    :math:`\alpha` is the learning rate,
+    and :math:`\beta` is the momentum constant.
     """
 
     x = guess
@@ -191,7 +217,7 @@
     unit_grad = unit_f / unit_x
 
     if step_size is None:
-        step_size = 0.1 * (unit_x / unit_grad)
+        step_size = 0.01 * (unit_x / unit_grad)
 
     if gradient is None:
         def gradient(x: float | na.AbstractScalar | na.AbstractVectorArray):
@@ -209,6 +235,7 @@
         function=function,
         guess=guess,
         step_size=step_size,
+        momentum=momentum,
         gradient=gradient,
         min_gradient=min_gradient,
         max_iterations=max_iterations,
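As a quick numerical trace of the two update rules documented in the new Notes section (the toy objective and constants below are chosen for illustration and do not appear in the PR):

# Minimize f(x) = x**2, whose gradient is 2 * x, with alpha = 0.1 and beta = 0.5.
x, z = 1.0, 0.0
alpha, beta = 0.1, 0.5
for _ in range(2):
    grad = 2 * x
    z = beta * z + grad  # iteration 1: z = 2.0;  iteration 2: z = 2.6
    x = x - alpha * z    # iteration 1: x = 0.8;  iteration 2: x = 0.54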
3 changes: 2 additions & 1 deletion named_arrays/tests/test_core.py
@@ -1469,9 +1469,10 @@
             function=function,
             guess=array,
             callback=callback,
+            momentum=0.5,
         )
 
-        assert np.allclose(na.value(result), expected)
+        assert np.allclose(result, expected * na.unit_normalized(array))
         assert out is result
 
     @pytest.mark.parametrize(
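Putting it together, a hedged usage sketch of the updated public function: the momentum keyword, its default of zero, and the 0.01 default step size come from this diff, while the plain-float guess and the simple quadratic objective are assumptions made for illustration (the library's tests pass named_arrays arrays as the guess).

import numpy as np
import named_arrays as na

def objective(x):
    # Simple quadratic with its minimum at x = 3 (illustrative, not from the PR).
    return np.square(x - 3)

x_min = na.optimize.minimize_gradient_descent(
    function=objective,
    guess=0.0,      # assumed float-like guess
    momentum=0.9,   # new in this PR; momentum=0 recovers plain gradient descent
)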