From b0ed54c5bb38a3a8be3ff7576a89c3e021b80155 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 14:00:51 -0600
Subject: [PATCH 1/8] Fixed bug in convergence criterion for `named_arrays.optimize` functions.

---
 named_arrays/_scalars/scalar_named_array_functions.py | 2 +-
 named_arrays/_vectors/vector_named_array_functions.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/named_arrays/_scalars/scalar_named_array_functions.py b/named_arrays/_scalars/scalar_named_array_functions.py
index b3bb888..d0d6edc 100644
--- a/named_arrays/_scalars/scalar_named_array_functions.py
+++ b/named_arrays/_scalars/scalar_named_array_functions.py
@@ -917,7 +917,7 @@ def optimize_root_newton(
         if callback is not None:
             callback(i, x, f, converged)

-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error

         if np.all(converged):
             return x
diff --git a/named_arrays/_vectors/vector_named_array_functions.py b/named_arrays/_vectors/vector_named_array_functions.py
index 23b1a75..4768dc7 100644
--- a/named_arrays/_vectors/vector_named_array_functions.py
+++ b/named_arrays/_vectors/vector_named_array_functions.py
@@ -450,7 +450,7 @@ def optimize_root_newton(
         if callback is not None:
             callback(i, x, f, converged)

-        converged |= np.abs(f) < max_abs_error
+        converged = np.abs(f) < max_abs_error

         if np.all(converged):
             return x
@@ -555,7 +555,7 @@ def optimize_minimum_gradient_descent(

         grad = gradient(x)

-        converged |= np.abs(grad) < min_gradient
+        converged = np.abs(grad) < min_gradient

         if np.all(converged):
             return x
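
The change above replaces an in-place OR (`converged |= ...`) with a plain assignment, so the convergence mask is recomputed from the current residual on every iteration instead of latching permanently the first time an element falls within tolerance; a latched mask can report convergence for elements whose residual has since drifted back above `max_abs_error`. The following self-contained NumPy sketch (illustrative only, not the named_arrays implementation) contrasts the two criteria in a vectorized Newton iteration:

    import numpy as np

    def newton_root(f, dfdx, x0, max_abs_error=1e-10, max_iterations=100, sticky=False):
        # Vectorized Newton iteration over an array of initial guesses.
        x = np.asarray(x0, dtype=float).copy()
        converged = np.zeros(x.shape, dtype=bool)
        for i in range(max_iterations):
            residual = f(x)
            if sticky:
                # Old criterion: once an element is flagged it stays flagged,
                # even if a later step pushes its residual back out of tolerance.
                converged |= np.abs(residual) < max_abs_error
            else:
                # New criterion: the mask always reflects the current residual.
                converged = np.abs(residual) < max_abs_error
            if np.all(converged):
                return x
            x = x - residual / dfdx(x)
        raise ValueError("Newton's method did not converge")

    # Roots of x**2 - 2 from several starting points.
    print(newton_root(lambda x: x**2 - 2, lambda x: 2 * x, [1.0, 3.0, 10.0]))
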
From 6e39ca76ba51174f85a8190528f89dbbdebf86f2 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 16:36:20 -0600
Subject: [PATCH 2/8] asymmetric test function

---
 named_arrays/_vectors/tests/test_vectors.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 3f19bc4..dbd1069 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -578,7 +578,11 @@ class TestJacobian(
     @pytest.mark.parametrize(
         argnames="function",
         argvalues=[
-            lambda x: np.square(na.value(x) - shift_horizontal) + shift_vertical
+            lambda x: np.where(
+                na.value(x) - shift_horizontal < 0,
+                np.square(1 * (na.value(x) - shift_horizontal)) + shift_vertical,
+                np.square(2 * (na.value(x) - shift_horizontal)) + shift_vertical,
+            )
             for shift_horizontal in [20,]
             for shift_vertical in [-1]
         ],

From fe1ba88c2840742d32c1d5974a94c6ffa5fec622 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 16:40:56 -0600
Subject: [PATCH 3/8] revert last commit

---
 named_arrays/_vectors/tests/test_vectors.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index dbd1069..3f19bc4 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -578,11 +578,7 @@ class TestJacobian(
     @pytest.mark.parametrize(
         argnames="function",
         argvalues=[
-            lambda x: np.where(
-                na.value(x) - shift_horizontal < 0,
-                np.square(1 * (na.value(x) - shift_horizontal)) + shift_vertical,
-                np.square(2 * (na.value(x) - shift_horizontal)) + shift_vertical,
-            )
+            lambda x: np.square(na.value(x) - shift_horizontal) + shift_vertical
             for shift_horizontal in [20,]
             for shift_vertical in [-1]
         ],

From ac551abfd2535d67df27c1dff9fefb77291a4cc7 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 17:27:10 -0600
Subject: [PATCH 4/8] check units

---
 named_arrays/_vectors/tests/test_vectors.py | 5 ++++-
 named_arrays/tests/test_core.py             | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 3f19bc4..369fb13 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -597,7 +597,10 @@ class TestOptimizeRoot(
     @pytest.mark.parametrize(
         argnames="function,expected",
         argvalues=[
-            (lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length, shift_horizontal)
+            (
+                lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length * u.ph,
+                shift_horizontal,
+            )
             for shift_horizontal in [20,]
             for shift_vertical in [1,]
         ]
     )
diff --git a/named_arrays/tests/test_core.py b/named_arrays/tests/test_core.py
index 3a9a5e5..2ec1b80 100644
--- a/named_arrays/tests/test_core.py
+++ b/named_arrays/tests/test_core.py
@@ -1471,7 +1471,7 @@ def callback(i, x, f, c):
                 callback=callback,
             )

-            assert np.allclose(na.value(result), expected)
+            assert np.allclose(result, expected * na.unit_normalized(array))
             assert out is result

         @pytest.mark.parametrize(
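
Patch 4 above gives the test objective photon units (`u.ph`) and checks the solver output as a quantity, `np.allclose(result, expected * na.unit_normalized(array))`, instead of stripping units with `na.value(result)` first. Comparing stripped values can hide a result that is numerically right but carries the wrong unit. A minimal astropy.units sketch of the difference (separate from the named_arrays test suite):

    import numpy as np
    import astropy.units as u

    expected = 20 * u.mm
    result = 20 * u.m   # numerically equal, but in the wrong unit

    print(np.allclose(result.value, expected.value))  # True: the unit error goes unnoticed
    print(np.allclose(result, expected))              # False: quantities are converted before comparing
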
From 4388d8fbd707f960b9b831809c3087b2f165bb4f Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:03:19 -0600
Subject: [PATCH 5/8] Added momentum

---
 docs/refs.bib                                 | 10 +++++--
 named_arrays/_vectors/tests/test_vectors.py   |  4 +--
 .../_vectors/vector_named_array_functions.py  |  6 +++-
 named_arrays/optimize.py                      | 29 +++++++++++++++++--
 4 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/docs/refs.bib b/docs/refs.bib
index 713bb3d..1b20a6a 100644
--- a/docs/refs.bib
+++ b/docs/refs.bib
@@ -11,5 +11,11 @@ @article{Eriksson1990
     URL = {https://doi.org/10.1080/0025570X.1990.11977515},
     eprint = {https://doi.org/10.1080/0025570X.1990.11977515}
 }
-
-
+@article{Goh2017,
+    author = {Goh, Gabriel},
+    title = {Why Momentum Really Works},
+    journal = {Distill},
+    year = {2017},
+    url = {http://distill.pub/2017/momentum},
+    doi = {10.23915/distill.00006}
+}
diff --git a/named_arrays/_vectors/tests/test_vectors.py b/named_arrays/_vectors/tests/test_vectors.py
index 369fb13..e1afc8d 100644
--- a/named_arrays/_vectors/tests/test_vectors.py
+++ b/named_arrays/_vectors/tests/test_vectors.py
@@ -598,10 +598,10 @@ class TestOptimizeRoot(
         argnames="function,expected",
         argvalues=[
             (
-                lambda x: (np.square(na.value(x) - shift_horizontal) + shift_vertical).length * u.ph,
+                lambda x: (np.square((na.value(x) - shift_horizontal).length) + shift_vertical) * u.ph,
                 shift_horizontal,
             )
-            for shift_horizontal in [20,]
+            for shift_horizontal in [2,]
             for shift_vertical in [1,]
         ]
     )
diff --git a/named_arrays/_vectors/vector_named_array_functions.py b/named_arrays/_vectors/vector_named_array_functions.py
index 4768dc7..4bbc7c5 100644
--- a/named_arrays/_vectors/vector_named_array_functions.py
+++ b/named_arrays/_vectors/vector_named_array_functions.py
@@ -518,6 +518,7 @@ def optimize_minimum_gradient_descent(
     function: Callable[[na.AbstractVectorArray], na.AbstractScalar],
     guess: na.AbstractVectorArray,
     step_size: float | na.AbstractScalar,
+    momentum: float | na.AbstractScalar,
     gradient: None | Callable[[na.AbstractVectorArray], na.AbstractScalar],
     min_gradient: na.ScalarLike,
     max_iterations: int,
@@ -547,6 +548,7 @@

     converged = na.broadcast_to(0 * na.value(x), shape=shape).astype(bool)

     x = na.broadcast_to(x, shape).astype(float)
+    z = 0

     for i in range(max_iterations):
@@ -560,7 +562,9 @@
         if np.all(converged):
             return x

-        correction = step_size * grad
+        z = momentum * z + grad
+
+        correction = step_size * z

         x = x - correction

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index e710aa5..315d5ce 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -142,12 +142,13 @@ def minimum_gradient_descent(
     function: Callable[[InputT], OutputT],
     guess: InputT,
     step_size: None | InputT = None,
+    momentum: float | OutputT = 0,
     gradient: None | Callable[[InputT], InputT] = None,
     min_gradient: None | InputT = None,
     max_iterations: int = 1000,
     callback: None | Callable[[int, InputT, OutputT, na.AbstractArray], None] = None,
 ) -> InputT:
-    """
+    r"""
     Find the local minimum of the given function using the
     `gradient descent `_
     method.
@@ -161,7 +162,12 @@ def minimum_gradient_descent(
         The learning rate for the gradient descent algorithm.
         This should have the same units as ``x / gradient(x)``.
         If :obj:`None` (the default), this takes the value
-        ``0.1 * na.unit(x / gradient(x))``.
+        ``0.01 * na.unit(x / gradient(x))``.
+    momentum
+        The momentum constant, :math:`\beta`, for the gradient descent algorithm.
+        Should be a dimensionless number between zero and one.
+        Defaults to zero, which is equivalent to vanilla gradient descent with
+        no momentum.
     gradient
         The gradient of `function`.
         If :obj:`None` (the default), the gradient is computed using
@@ -180,6 +186,22 @@ def minimum_gradient_descent(
         ``x`` is the current guess,
         ``f`` is the current function value,
         and ``converged`` is an array storing the convergence state for every
         minimum being computed.
+
+    Notes
+    -----
+
+    This function uses the update rules described in :cite:t:`Goh2017`,
+
+    .. math::
+
+        z_{k + 1} = \beta z_k + \grad f(x_k) \\
+
+        x_{k + 1} = x_k - \alpha z_k,
+
+    where :math:`x_k` is the current guess for iteration :math:`k`,
+    :math:`f` is the objective function,
+    :math:`\alpha` is the learning rate,
+    and :math:`\beta` is the momentum constant.
     """

     x = guess
@@ -191,7 +213,7 @@ def minimum_gradient_descent(
     unit_grad = unit_f / unit_x

     if step_size is None:
-        step_size = 0.1 * (unit_x / unit_grad)
+        step_size = 0.01 * (unit_x / unit_grad)

     if gradient is None:
         def gradient(x: float | na.AbstractScalar | na.AbstractVectorArray):
@@ -209,6 +231,7 @@ def gradient(x: float | na.AbstractScalar | na.AbstractVectorArray):
         function=function,
         guess=guess,
         step_size=step_size,
+        momentum=momentum,
         gradient=gradient,
         min_gradient=min_gradient,
         max_iterations=max_iterations,

From 7c7a8af73ef751d8749b2a499d030662aa695705 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:05:51 -0600
Subject: [PATCH 6/8] Use momentum in tests

---
 named_arrays/tests/test_core.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/named_arrays/tests/test_core.py b/named_arrays/tests/test_core.py
index 2ec1b80..8b9ded1 100644
--- a/named_arrays/tests/test_core.py
+++ b/named_arrays/tests/test_core.py
@@ -1469,6 +1469,7 @@ def callback(i, x, f, c):
                 function=function,
                 guess=array,
                 callback=callback,
+                momentum=0.5,
             )

             assert np.allclose(result, expected * na.unit_normalized(array))

From 1d1f6134d56b5f1b79f212c857db67ab489d9163 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:30:50 -0600
Subject: [PATCH 7/8] docs

---
 named_arrays/optimize.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index 315d5ce..f7845d8 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -193,8 +193,12 @@ def minimum_gradient_descent(
     This function uses the update rules described in :cite:t:`Goh2017`,

     .. math::
+        :label: momentum-equation

-        z_{k + 1} = \beta z_k + \grad f(x_k) \\
+        z_{k + 1} = \beta z_k + \del f(x_k)
+
+    .. math::
+        :label: gradient-descent

         x_{k + 1} = x_k - \alpha z_k,


From 5fd781a5bb411a9cedf55bdfc514765c0403c5e0 Mon Sep 17 00:00:00 2001
From: Roy Smart
Date: Sun, 25 Aug 2024 22:48:23 -0600
Subject: [PATCH 8/8] docs

---
 named_arrays/optimize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/named_arrays/optimize.py b/named_arrays/optimize.py
index f7845d8..d6fc6ef 100644
--- a/named_arrays/optimize.py
+++ b/named_arrays/optimize.py
@@ -195,7 +195,7 @@ def minimum_gradient_descent(
     .. math::
         :label: momentum-equation

-        z_{k + 1} = \beta z_k + \del f(x_k)
+        z_{k + 1} = \beta z_k + \nabla f(x_k)

     .. math::
         :label: gradient-descent